# Continuous Control

---

In this notebook, you will learn how to use the Unity ML-Agents environment for the second project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from unityagents import UnityEnvironment
import collections
from multiprocessing import Process
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

In [2]:
env = UnityEnvironment(file_name="C:/Users/gabyc/Desktop/Reinforcment_TP/deep-reinforcement-learning/p3_collab-compet/Tennis_Windows_x86_64/Tennis.exe")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


# Network

In [105]:
class Policy(nn.Module):
    # Actor network 
    def __init__(self,input_size,nb_action):
        super(Policy, self).__init__()
        self.nb_action = nb_action
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.fc1 = nn.Linear(input_size,512)
        self.fc2 = nn.Linear(512,256)
        self.fc3 = nn.Linear(256,nb_action)
        self.sigma = nn.Linear(256,nb_action)
        
        
    def forward(self, x):
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mu = F.tanh(self.fc3(x)) # Tanh because action_values between -1 and 1.

        # Making stochastic policy
        #log_sigma = -torch.relu(self.sigma(x))
        #sigma = torch.exp(log_sigma)
        sigma = torch.ones(self.nb_action,requires_grad=False).to(self.device)/2 # Variance of 0.5 for each action
        m = torch.distributions.normal.Normal(mu,sigma,False)# False, whereas constraint on mu = 0
        return m

In [106]:
class Critic(nn.Module):
    # Critic network 
    def __init__(self,input_size):
        super(Critic, self).__init__()
        
        self.fc1 = nn.Linear(input_size,512)
        self.fc2 = nn.Linear(512,256)
        self.fc3 = nn.Linear(256,1) # 1 output -> Value estimate
        
    def forward(self, x):
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return  F.leaky_relu(self.fc3(x)) 

# Training

In [107]:
def New_prob(policy,states,actions,device):
    # The Gradient FLOW on action
    # The Gradient fon't FLOW on state 
    # No Clipping.
    Tab = []
    Action_sample_tab = []
    m = policy(states[0])
    
    proba = m.log_prob(actions[0])

    # STORE
    Tab.append(proba)
    Action_sample_tab.append(actions[0])
    
    # Loop over the state and action (a,s)
    for state_iter,action_iter in zip(states[1:],actions[1:]):
        m = policy(state_iter)
        proba = m.log_prob(action_iter) # Prob on the previous action but new policy
   
        # STORE
        Tab.append(proba)
        Action_sample_tab.append(action_iter)

    return torch.stack(Tab),torch.stack(Action_sample_tab)

In [108]:
def clipped_surrogate(Delta_t,critic,optimizer,device,policy, old_probs,actions, states,state_full, rewards,batch_size,
                      discount = 0.995, epsilon=0.1, beta=0.01):
    
    # Convert REWARD TO REWARD FUTURE
    rewards = np.asarray(rewards)

    reward_futur = np.zeros(rewards.shape[0])
    longueur = rewards.shape[0] - 1
    reward_futur[longueur] = rewards[longueur]
    new_discount = 0
    for i in range(1,rewards.shape[0]):
        new_discount = discount**(longueur-i) 
        reward_futur[longueur-i] = reward_futur[longueur-(i-1)] + rewards[longueur-i]*new_discount
    
    
    #Normalize At
    Delta_t = Delta_t.detach()
    Delta_t = Delta_t[:,None].repeat(1, old_probs.shape[1])
    Delta_t = (Delta_t- Delta_t.mean())/(Delta_t.std() + 1e-10)
    
    new_prob,action_sample = New_prob(policy, states,actions,device)
    
    # Compute each 
    Fraction = torch.exp(new_prob-(old_probs+1e-10))
    Cote1 = Delta_t*Fraction 
    Cote2 = Delta_t*torch.clamp(Fraction, 1-epsilon, 1+epsilon) 
    Cote1 = Cote1[:, :,None]
    Cote2 = Cote2[:, :,None]
    comp = torch.cat((Cote1, Cote2),2)
    Gradient = torch.min(comp,2)[0].to(device) # Surrogate function


    entropy = -(torch.exp(new_prob)*old_probs+1.e-10)+ \
        (1.0-torch.exp(new_prob))*(1.0-old_probs+1.e-10) # Entropy to enhance exploration

    writer.add_scalar('Entropy',torch.mean(beta*(entropy)),iteration_all)
    writer.add_scalar('Gradient',torch.mean(Gradient),iteration_all)
    
    MSE = TD_Training(critic,optimizer,state_full,reward_futur,discount,device) # Critic network training
    writer.add_scalar('Loss/Critic',MSE,iteration_all)

    return -torch.mean(beta*(entropy) + Gradient)
        

In [109]:
def TD_Training(Critic,optimizereer,states,reward,discount,device):
    # Function for training the critic
    states = states.detach()
    reward = torch.from_numpy(reward).detach()
    value_loss = []
    for st in states:
        Valuet = Critic(st.flatten())
        value_loss.append(Valuet)
    
    Loss = 0.5*(discount*reward.to(device)[:,None] - torch.stack(value_loss)).pow(2).mean() # Simple MSE Loss
    optimizereer.zero_grad()
    Loss.backward()
    optimizereer.step()
    return Loss

# Evaluation

In [110]:
def collect_trajectories_double(env,env_info,policy,policy2,device,tmax):
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    state = env_info.vector_observations # get the current state (for each agent)
    states_tab = []
    states_tab1 , action_tab1, reward_tab1, prob_tab1,done1 = [],[],[], [], []
    states_tab2 , action_tab2, reward_tab2, prob_tab2,done2 = [],[],[], [], []
    t = 0
    reward_episode = []
    while t < tmax:
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        #state = torch.from_numpy(state).to(device)
        score = np.zeros(2)
        while True:
            policy.eval()
            policy2.eval()
            with torch.no_grad(): # Everything with torch no grad.
                m = policy(torch.from_numpy(state[0]).to(device))
                m2 = policy2(torch.from_numpy(state[1]).to(device))
                

                # Sample maybe on gradient as to check that
                sample = m.sample()
                sample2 = m2.sample()
                action_tab1.append(sample)
                action_tab2.append(sample2)# No clip and store
                states_tab.append(torch.from_numpy(state))
                #states_tab.append(torch.cat((torch.from_numpy(state),sample[:,None],sample2[:,None],),axis=1))

                # Proba not on clip and detach from Gradient.
                proba = m.log_prob(sample)
                proba2 = m2.log_prob(sample2)
                #proba = torch.exp(proba) #Proba on CUDA no detach

                # Interact with the environment 
                sample = torch.clip(sample.detach().cpu(), -1, 1) # CLIP BEFORE TAKING THE PROBA OR AFTER?
                sample = sample.numpy()

                sample2 = torch.clip(sample2.detach().cpu(), -1, 1) # CLIP BEFORE TAKING THE PROBA OR AFTER?
                sample2 = sample2.numpy()


                action_env = np.concatenate([np.expand_dims(sample,axis=0),np.expand_dims(sample2,axis=0)],axis=0)

                # Step the environment
                env_info = env.step(action_env)[brain_name]           # send all actions to the environment
                next_states = env_info.vector_observations         # get next state (for each agent)
                rewards = env_info.rewards                         # get reward (for each agent)
                dones = env_info.local_done                        # see if episode finished           
 
                score += rewards

                # Store values
                prob_tab1.append(proba)
                prob_tab2.append(proba2)

                reward_tab1.append(np.asarray(rewards)[0])
                reward_tab2.append(np.asarray(rewards)[1])
                states_tab1.append(torch.from_numpy(state[0]).to(device))
                states_tab2.append(torch.from_numpy(state[1]).to(device))

                # BREAK IF END OF THE EPISODE
                if np.any(dones):                                  # exit loop if episode finished
                    reward_episode.append(score)
                    break
                if t >= tmax:
                    reward_episode.append(score)
                    break
                state = next_states
                t +=1
            
    return states_tab, states_tab1 , action_tab1, reward_tab1, prob_tab1,done1, states_tab2 , action_tab2, reward_tab2, prob_tab2,done2,reward_episode

In [111]:
def TD_evaluation(Critic,states,reward,discount,device):
    # Calculate TD error during the evaluation step
    Delta_t = []
    Tab = []
    Critic.eval()
    with torch.no_grad(): 
        Valuet = Critic(states[0].flatten())
    
        for rw,st in zip(reward[0:],states[1:]):
            Valuetplus1 = Critic(st.flatten())
            Tab.append(Valuetplus1)

            TD_error = torch.from_numpy(np.asarray(rw)).to(device) + discount*Valuetplus1[0] - Valuet[0] #TD ERROR
            Delta_t.append(TD_error)
            
            Valuet = Valuetplus1
    writer.add_histogram('Values',torch.stack(Tab),e)
    return torch.stack(Delta_t)

In [112]:
def GAE_evaluation(Delta_t,discount,lambd):
    # GAE Function adapted from https://github.com/numblr/drlnd-cocontrol
    flipped = torch.flip(Delta_t, dims=(0,))
    result = torch.zeros_like(flipped)
    result[0] = flipped[0]
    for i in range(1, flipped.size()[0]):
        result[i] = discount * lambd * result[i-1] + flipped[i]

    return torch.flip(result, dims=(0,))

# Launch Main code

In [128]:
# get the default brain
from collections import deque

brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]  
states = env_info.vector_observations # get the current state (for each agent
num_agents = len(states)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
nb_states = len(states[0])
action_size = brain.vector_action_space_size
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

policy = Policy(nb_states,action_size).to(device) # Policy network
policy.load_state_dict(torch.load("PPO_actor_stable.pth"))
optimizer = optim.Adam(policy.parameters(), lr=1e-4)
critic = Critic(nb_states*2).to(device) # Critic network
critic.load_state_dict(torch.load("PPO_critic_stable.pth"))
optimizer_c = optim.Adam(critic.parameters(), lr=1e-4)

policy2 = Policy(nb_states,action_size).to(device) # Policy network
policy2.load_state_dict(torch.load("PPO_actor2_stable.pth"))
optimizer2 = optim.Adam(policy2.parameters(), lr=1e-4)
critic2 = Critic(nb_states*2).to(device) # Critic network
critic2.load_state_dict(torch.load("PPO_critic2_stable.pth"))
optimizer_c2 = optim.Adam(critic2.parameters(), lr=1e-4)
# 1e-4 work well
writer = SummaryWriter()

In [None]:
###################################################### MAIN_CODE #################################################
# training loop max iterations
episode = 5000

tmax = 2500 # 2500
discount_rate = .99
epsilon = 0.1
beta = .01
SGD_epoch = 2
batch_size = 512 #128
lambd = 0.95

# keep track of progress
aleatoire = True
mean_rewards = []
writer.add_text("CONFIG","Critic_nbstate*2, 1e-4, after loading" + str(aleatoire) + "tmax :" + str(tmax) + "batch_size :" + str(batch_size) + "discount_rate :" + str(discount_rate) + "epsilon" + str(epsilon)+ "beta" + str(beta) + "SGD_epoch :" + str(SGD_epoch) + "lambd :" + str(lambd) + "lr : 2e-4 x2")
iteration_all = 0

itbis = 0
scores_deque = deque(maxlen=100)
for e in range(episode):
   # if beta <= 0.01:
   #     beta = 0.01
    #else :
    #    beta = beta*.99
    writer.add_scalar('beta_decrease',beta,e)
    # EVALUATION STEP
    # collect trajectories
    states_tab, states1 , action1, reward1, prob1, done1, states2 , action2, reward2, prob2, done2,reward_episode = collect_trajectories_double(env,env_info, policy,policy2, device,tmax)
    total_rewards = np.mean(np.sum(reward_episode,axis=0))

    for r in reward_episode:
        itbis+=1
        writer.add_scalar('Score_agent1',r[0],itbis)
        writer.add_scalar('Score_agent2',r[1],itbis)
        scores_deque.append(np.max(r))
        writer.add_scalar('Score_espisode_mean',np.mean(scores_deque),itbis)
    
    # Compute advantages estimate for first agent
    Delta_t1 = TD_evaluation(critic,states_tab,reward1,discount_rate,device)
    writer.add_scalar('DeltaT1',torch.mean(Delta_t1),iteration_all)
    Delta_t1 = GAE_evaluation(Delta_t1,discount_rate,lambd)
    writer.add_scalar('Advantage1',torch.mean(Delta_t1),iteration_all)
    
    # Compute advantages estimate for the second agent
    Delta_t2 = TD_evaluation(critic2,states_tab,reward2,discount_rate,device)
    writer.add_scalar('DeltaT2',torch.mean(Delta_t2),iteration_all)
    Delta_t2 = GAE_evaluation(Delta_t2,discount_rate,lambd)
    writer.add_scalar('Advantage2',torch.mean(Delta_t2),iteration_all)
    
    # To keep trajectories and deltaT accordingly for first agent
    states1 = torch.stack(states1)[:-1]
    action1 = torch.stack(action1)[:-1]
    prob1 = torch.stack(prob1)[:-1]
    reward1 = np.asarray(reward1)[:-1]
    
    # To keep trajectories and deltaT accordingly for second agent
    states2 = torch.stack(states2)[:-1]
    action2 = torch.stack(action2)[:-1]
    prob2 = torch.stack(prob2)[:-1]
    reward2 = np.asarray(reward2)[:-1]
    
    states_tab = torch.stack(states_tab)[:-1]
    
    # TRAINING STEP
    indices = torch.split(torch.from_numpy(np.arange(0,states1.shape[0],1)),batch_size,0) # Make chunk of the trajectory
    for epoch in range(SGD_epoch):
        # TRAINING OVER THE BATCH SIZE
        for chunks in indices:
            iteration_all += 1
            chunk = chunks.long()
            chunk_numpy = chunk.numpy().astype('int')
            
            # Pick Chunk of the trajectories
            states_chunk1 = states1[chunk]
            actions_chunk1 = action1[chunk]
            prob_chunk1 = prob1[chunk]
            rewards_chunk1 = reward1[chunk_numpy]
            Delta_t_chunk1 = Delta_t1[chunk]
            rewards_chunk1 = rewards_chunk1.tolist()
            
            states_chunk2 = states2[chunk]
            actions_chunk2 = action2[chunk]
            prob_chunk2 = prob2[chunk]
            rewards_chunk2 = reward2[chunk_numpy]
            Delta_t_chunk2 = Delta_t2[chunk]
            rewards_chunk2 = rewards_chunk2.tolist()
            
            state_full = states_tab[chunk]
            
            # Learning of first agent via surrogate trainning
            L1 = clipped_surrogate(Delta_t_chunk1,critic,optimizer_c,device,policy,prob_chunk1,actions_chunk1, states_chunk1,state_full, rewards_chunk1,batch_size, epsilon=epsilon, beta=beta)
            optimizer.zero_grad()
            L1.backward()
            optimizer.step()
            writer.add_scalar('Loss/Policy1',L1,iteration_all)
            
            # Learning of second agent via surrogate trainning
            L2 = clipped_surrogate(Delta_t_chunk2,critic2,optimizer_c2,device,policy2,prob_chunk2,actions_chunk2, states_chunk2,state_full, rewards_chunk2,batch_size, epsilon=epsilon, beta=beta)
            optimizer2.zero_grad()
            L2.backward()
            optimizer2.step()
            writer.add_scalar('Loss/Policy2',L2,iteration_all)
            del L1
            del L2
    writer.add_scalar('Score',total_rewards,e)
            
    mean_rewards.append(total_rewards)
    # display some progress every 20 iterations
    if (e+1)%20 ==0 :
        print("################################")
        print("Episode: {0:d}, score: {1:f}".format(e+1,total_rewards))
        print(total_rewards)

################################
Episode: 20, score: 5.465000
5.465000086463988
################################
Episode: 40, score: 5.465000
5.46500008739531
################################
Episode: 60, score: 5.695000
5.695000088773668
################################
Episode: 80, score: 5.295000
5.2950000846758485
################################
Episode: 100, score: 5.685000
5.685000088997185
################################
Episode: 120, score: 6.020000
6.020000092685223
################################
Episode: 140, score: 5.630000
5.630000088363886
################################
Episode: 160, score: 5.510000
5.510000088252127
################################
Episode: 180, score: 5.980000
5.980000091716647
################################
Episode: 200, score: 5.360000
5.36000008508563
################################
Episode: 220, score: 5.980000
5.98000009264797


In [206]:
policy(torch.stack(states1)[5]).sample() # DIRECT DES NAN.

tensor([nan, nan])

In [207]:
policy2(torch.stack(states2)[1]).sample() # DIRECT DES NAN.

tensor([nan, nan])

In [None]:
writer.close()

In [127]:
torch.save(policy2.state_dict(), 'PPO_actor2_stable_finish.pth')
torch.save(critic2.state_dict(), 'PPO_critic2_stable_finish.pth')

torch.save(policy.state_dict(), 'PPO_actor_stable_finish.pth')
torch.save(critic.state_dict(), 'PPO_critic_stable_finish.pth')

In [None]:
def log_prob(self, value):
    if self._validate_args:
        self._validate_sample(value)
    # compute the variance
    var = (self.scale ** 2) #VARIANCE POSITIVE OK SIGMA TOUJOURS POSITIVES NORMALEMENT.
    log_scale = math.log(self.scale) if isinstance(self.scale, Real) else self.scale.log()
    return -((value - self.loc) ** 2) / (2 * var) - log_scale - math.log(math.sqrt(2 * math.pi))

In [213]:
import math

In [214]:
b = math.nan

In [217]:
print(b + 1e-5)

nan
