# Continuous Control

---

In this notebook, you will learn how to use the Unity ML-Agents environment for the second project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from unityagents import UnityEnvironment
import collections
from multiprocessing import Process
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

In [2]:
env = UnityEnvironment(file_name="C:/Users/gabyc/Desktop/Reinforcment_TP/deep-reinforcement-learning/p3_collab-compet/Tennis_Windows_x86_64/Tennis.exe")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


# Network

In [None]:
class Policy(nn.Module):
    # Actor network 
    def __init__(self,input_size,nb_action):
        super(Policy, self).__init__()
        self.nb_action = nb_action
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.fc1 = nn.Linear(input_size,200)
        self.fc2 = nn.Linear(200,75)
        self.fc2bis = nn.Linear(200,75)
        self.fc3 = nn.Linear(75,nb_action)
        self.fc3bis = nn.Linear(75,nb_action)
        
        for name, param in self.fc1.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
            
        for name, param in self.fc2.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
        
        for name, param in self.fc2bis.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
                
        for name, param in self.fc3bis.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
                
        for name, param in self.fc3.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
        
        
    def forward(self, x):
        x = x.float()
        x = F.relu(self.fc1(x))
        #x2 = x.clone()
        x = F.relu(self.fc2(x))
        mu = F.tanh(self.fc3(x)) # Tanh because action_values between -1 and 1.
        #sigma = F.sigmoid(self.fc2bis(x2))
        #sigma = torch.clamp(F.sigmoid(self.fc3bis(sigma)),min=0.1)
        sigma = torch.ones(self.nb_action,requires_grad=False).to(self.device)/4
        m = torch.distributions.normal.Normal(mu,sigma,False)# False, whereas constraint on mu = 0
        return m

In [None]:
class Critic(nn.Module):
    # Critic network 
    def __init__(self,input_size):
        super(Critic, self).__init__()
        
        self.fc1 = nn.Linear(input_size,150)
        self.fc2 = nn.Linear(150,50)
        self.fc3 = nn.Linear(50,1) # 1 output -> Value estimate
        
        for name, param in self.fc1.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
            
        for name, param in self.fc2.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
                
        for name, param in self.fc3.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
        
    def forward(self, x):
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return  F.leaky_relu(self.fc3(x)) 

# Training

In [None]:
def New_prob(policy,states,actions,device):
    # The Gradient FLOW on action
    # The Gradient fon't FLOW on state 
    # No Clipping.
    Tab = []
    Action_sample_tab = []
    m = policy(states[0])
    
    proba = m.log_prob(actions[0] + 1e-10)

    # STORE
    Tab.append(proba)
    Action_sample_tab.append(actions[0])
    
    # Loop over the state and action (a,s)
    for state_iter,action_iter in zip(states[1:],actions[1:]):
        m = policy(state_iter)
        proba = m.log_prob(action_iter + 1e-10) # Prob on the previous action but new policy
   
        # STORE
        Tab.append(proba)
        Action_sample_tab.append(action_iter)

    return torch.stack(Tab),torch.stack(Action_sample_tab)

In [None]:
def clipped_surrogate(Delta_t,critic,device,policy, old_probs,actions, states, rewards,batch_size,nb_agent,
                      discount = 0.995, epsilon=0.1, beta=0.01):
    
    rewards = np.asarray(rewards).mean(axis=1)
    old_probs_extract = old_probs[:,nb_agent,:]
    states_extract = states[:,nb_agent,:]
    actions_extract = actions[:,nb_agent,:]

    
    # Convert REWARD TO REWARD FUTURE
    reward_futur = np.zeros(rewards.shape[0])
    longueur = rewards.shape[0] - 1
    reward_futur[longueur] = rewards[longueur]
    new_discount = 0
    for i in range(1,rewards.shape[0]):
        new_discount = discount**(longueur-i) 
        reward_futur[longueur-i] = reward_futur[longueur-(i-1)] + rewards[longueur-i]*new_discount
        
    #Normalize At
    Delta_t = Delta_t.detach()
    Delta_t = Delta_t[:,None].repeat(1, old_probs.shape[2])
    Delta_t = (Delta_t- Delta_t.mean())/Delta_t.std()
    
    new_prob,action_sample = New_prob(policy, states_extract,actions_extract,device)
    
    # Compute each 
    Fraction = torch.exp(new_prob-(old_probs_extract+1e-10))
    Cote1 = Delta_t*Fraction 
    Cote2 = Delta_t*torch.clamp(Fraction, 1-epsilon, 1+epsilon) 
    Cote1 = Cote1[:, :,None]
    Cote2 = Cote2[:, :,None]
    comp = torch.cat((Cote1, Cote2),2)
    Gradient = torch.min(comp,2)[0].to(device) # Surrogate function


    entropy = -(torch.exp(new_prob)*old_probs_extract+1.e-10)+ \
        (1.0-torch.exp(new_prob))*(1.0-old_probs_extract+1.e-10) # Entropy to enhance exploration

    writer.add_scalar('Entropy',torch.mean(beta*(entropy)),iteration_all)
    writer.add_scalar('Gradient',torch.mean(Gradient),iteration_all)
    
    # ATTENTION ENTRAINE DEUX FOIS.
    # REWARD TRAINING ON BOTH REWARD.
    MSE = TD_Training(critic,states,rewards,actions,discount,device) # Critic network training
    writer.add_scalar('Loss/Critic',MSE,iteration_all)

    return -torch.mean(beta*(entropy) + Gradient)
        

In [None]:
def TD_Training(Critic,states,reward,actions,discount,device):
    states = states.detach()
    reward = torch.from_numpy(reward).detach()
    value_loss = []
    for st,ac in zip(states,actions):
        stack = torch.cat((st.flatten(),ac.flatten()),axis=0)
        Valuet = Critic(stack)
        value_loss.append(Valuet)
    
    #print("TD_Train")
    #print(torch.stack(value_loss).shape)
    #print(reward.to(device).mean(axis=1)[:,None].shape)
    # Mean or Not Mean
    Loss = 0.5*(discount*reward.to(device)[:,None] - torch.stack(value_loss)).pow(2).mean() # Simple MSE Loss
    optimizer_c.zero_grad()
    Loss.backward()
    optimizer_c.step()
    return Loss

# Evaluation

In [None]:
def collect_trajectories(env,env_info,policy1,policy2,device,tmax):
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
     # get the current state (for each agent)
    states_tab , action_tab, reward_tab, prob_tab = [],[],[], []
    reward_episode = []
    t = 0
    while t < tmax:
        reward_episode_temp = []
        env_info = env.reset(train_mode=True)[brain_name] 
        state = env_info.vector_observations # A VERIFIER CALAGE ACTIONS /STATES
        while True:
            state = torch.from_numpy(state).to(device)
            policy1.eval()
            policy2.eval()
            with torch.no_grad(): # Everything with torch no grad.

                ################ FIRST AND SECOND AGENT ################
                m = policy1(state[0])
                m2 = policy2(state[1])

                # Sample 1 and 2
                sample = m.sample()
                sample2 = m2.sample()
                action_tab.append(torch.stack([sample,sample2])) # No clip and store

                # Proba not on clip and detach from Gradient.
                proba = m.log_prob(sample)
                proba2 = m2.log_prob(sample2)

                # Interact with the environment 
                sample = torch.clip(sample.detach().cpu(), -1, 1)
                sample2 = torch.clip(sample2.detach().cpu(), -1, 1)
                sample = np.expand_dims(sample.numpy(),axis=0)
                sample2 = np.expand_dims(sample2.numpy(),axis=0)
                sample = np.concatenate([sample,sample2],axis=0)

                # Step the environment
                env_info = env.step(sample)[brain_name]           # send all actions to the environment
                next_states = env_info.vector_observations         # get next state (for each agent)
                rewards = env_info.rewards                         # get reward (for each agent)
                dones = env_info.local_done                        # see if episode finished

                # Store values
                prob_tab.append(torch.stack([proba,proba2]))
                reward_tab.append(np.asarray(rewards))
                states_tab.append(state)
                reward_episode_temp.append(np.asarray(rewards))

                # BREAK IF END OF THE EPISODE
                if np.any(dones):                                  # exit loop if episode finished
                    reward_episode.append(np.sum(reward_episode_temp,axis=0))
                    # Write np max after to see the reward of the two agent
                    break
                if t >= tmax:
                    break
                state = next_states
                t +=1
    #writer.add_histogram('MU/Sample_mu_action0',torch.mean(torch.stack(action_tab)[:,:,0],axis=1),iteration_all)
    #writer.add_histogram('MU/Sample_mu_action1',torch.mean(torch.stack(action_tab)[:,:,1],axis=1),iteration_all)
    #writer.add_histogram('MU/Sample_mu_action2',torch.mean(torch.stack(action_tab)[:,:,2],axis=1),iteration_all)
    #writer.add_histogram('MU/Sample_mu_action3',torch.mean(torch.stack(action_tab)[:,:,3],axis=1),iteration_all)
    return states_tab, action_tab, reward_tab,prob_tab,reward_episode

In [None]:
def TD_evaluation(Critic,states,actions,reward,discount,device):
    # Calculate TD error during the evaluation step
    Delta_t = []
    Tab = []
    Critic.eval()
    with torch.no_grad():
        Valuet = Critic(torch.cat((states[0].flatten(),actions[0].flatten()),axis=0))
    
        for rw,st,ac in zip(reward[0:],states[1:],actions[1:]):
            stack = torch.cat((st.flatten(),ac.flatten()),axis=0)
            Valuetplus1 = Critic(stack)
            Tab.append(Valuetplus1)
            
            # IT HAS BEEN DEALT WITH THE DIMENSION HERE
            TD_error = torch.from_numpy(rw).to(device).mean() + discount*Valuetplus1[0] - Valuet[0] #TD ERROR
            # CHANGE THING HERE DOESN'T CORRESPOND WELL. Not the same shape for now
            Delta_t.append(TD_error)
            
            Valuet = Valuetplus1
    print(torch.stack(Tab))
    #writer.add_histogram('Values',torch.mean(torch.stack(Tab),axis=1),e)
    writer.add_histogram('Values',torch.stack(Tab),e)
    # Torch mean usefull only if two reward
    return torch.stack(Delta_t)

In [None]:
def GAE_evaluation(Delta_t,discount,lambd):
    # GAE Function adapted from https://github.com/numblr/drlnd-cocontrol
    # Something strange about the dimensions here
    #print(Delta_t.shape)
    flipped = torch.flip(Delta_t, dims=(0,))
    result = torch.zeros_like(flipped)
    result[0] = flipped[0]  #Changed here 1D
    for i in range(1, flipped.size()[0]):
        result[i] = discount * lambd * result[i-1] + flipped[i] # Changed here 1D

    return torch.flip(result, dims=(0,))

# Launch Main code

In [105]:
# get the default brain
from collections import deque
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]  
states = env_info.vector_observations # get the current state (for each agent
num_agents = len(states)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
nb_states = len(states[0])
action_size = brain.vector_action_space_size
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### First Agent
policy1 = Policy(nb_states,action_size).to(device) # Policy network
optimizer1 = optim.Adam(policy1.parameters(), lr=4e-4)

### Second Agent
policy2 = Policy(nb_states,action_size).to(device) # Policy network
optimizer2 = optim.Adam(policy2.parameters(), lr=4e-4)

# Central Critic
critic = Critic(nb_states*2+action_size*2).to(device) # Critic network
optimizer_c = optim.Adam(critic.parameters(), lr=4e-4)
writer = SummaryWriter()

In [106]:
###################################################### MAIN_CODE #################################################
# training loop max iterations
episode = 10000


tmax = 1000
discount_rate = .9997
epsilon = 0.04 # try less than 0.1?
beta = .01
SGD_epoch = 6
batch_size = 128 #64
lambd = 0.80

# keep track of progress
mean_rewards = []
aleatoire = False
writer.add_text("CONFIG","aleatoire :" + str(aleatoire) + "tmax :" + str(tmax) + "batch_size :" + str(batch_size) + "discount_rate :" + str(discount_rate) + "epsilon" + str(epsilon)+ "beta" + str(beta) + "SGD_epoch :" + str(SGD_epoch) + "lambd :" + str(lambd) + "lr : 2e-4 x2")
iteration_all = 0

# NEW VARIABLE ADDED
itbis = 0
scores_deque = deque(maxlen=100)
All_av_Score = []
for e in range(episode):
    
    # EVALUATION STEP
    # collect trajectories
    states, actions, rewards,prob,reward_episode = collect_trajectories(env, env_info, policy1, policy2, device, tmax)
    total_rewards = np.mean(np.sum(rewards,axis=0))
    # REWARD COMPUTATION
    for r in reward_episode:
        itbis+=1
        writer.add_scalar('Score_agent1',r[0],itbis)
        writer.add_scalar('Score_agent2',r[1],itbis)
        scores_deque.append(np.max(r))
        writer.add_scalar('Score_espisode_mean',np.mean(scores_deque),itbis)
        
    
    # Compute advantages estimate
    Delta_t = TD_evaluation(critic,states,actions,rewards,discount_rate,device)
    writer.add_scalar('DeltaT',torch.mean(Delta_t),iteration_all)
    Delta_t = GAE_evaluation(Delta_t,discount_rate,lambd)
    writer.add_scalar('Advantage',torch.mean(Delta_t),iteration_all)
    
    states = torch.stack(states)[:-1]
    actions = torch.stack(actions)[:-1]
    prob = torch.stack(prob)[:-1]
    rewards = np.asarray(rewards)[:-1]
    
    # TRAINING STEP
    indices = torch.split(torch.from_numpy(np.arange(0,states.shape[0],1)),batch_size,0) # Make chunk of the trajectory
    for epoch in range(SGD_epoch):
        # TRAINING OVER THE BATCH SIZE
        for chunks in indices:
            iteration_all += 1
            chunk = chunks.long()
            chunk_numpy = chunk.numpy().astype('int')

            states_chunk = states[chunk]
            actions_chunk = actions[chunk]
            prob_chunk = prob[chunk]
            rewards_chunk = rewards[chunk_numpy]
            Delta_t_chunk = Delta_t[chunk]
            rewards_chunk = rewards_chunk.tolist()
            
            # First Version Separate Clipped with no parallel simple learning
            nb_agent = 0
            L1 = clipped_surrogate(Delta_t_chunk,critic,device,policy1,prob_chunk,actions_chunk, states_chunk, rewards_chunk,batch_size,nb_agent, epsilon=epsilon, beta=beta)
            optimizer1.zero_grad()
            L1.backward()
            optimizer1.step()
            
            nb_agent = 1
            L2 = clipped_surrogate(Delta_t_chunk,critic,device,policy2,prob_chunk,actions_chunk, states_chunk, rewards_chunk,batch_size, nb_agent,epsilon=epsilon, beta=beta)
            optimizer2.zero_grad()
            L2.backward()
            optimizer2.step()
            
            writer.add_scalar('Loss/Policy1',L1,iteration_all)
            writer.add_scalar('Loss/Policy2',L2,iteration_all)
            del L1
            del L2
    writer.add_scalar('Score',total_rewards,e)
            
    mean_rewards.append(total_rewards)
    # display some progress every 20 iterations
    if (e+1)%20 ==0 :
        print("################################")
        print("Episode: {0:d}, score: {1:f}".format(e+1,total_rewards))
        print(total_rewards)
        

tensor([[-0.0058],
        [ 0.4858],
        [ 1.6755],
        ...,
        [ 3.6385],
        [ 2.1424],
        [ 1.9187]])
tensor([[-0.0275],
        [-0.0297],
        [-0.0508],
        ...,
        [-0.0470],
        [-0.0439],
        [-0.0383]])
tensor([[-0.0244],
        [-0.0222],
        [-0.0424],
        ...,
        [-0.0376],
        [-0.0315],
        [-0.0367]])
tensor([[-0.0196],
        [-0.0277],
        [-0.0203],
        ...,
        [-0.0136],
        [-0.0488],
        [-0.0065]])
tensor([[-0.0196],
        [-0.0230],
        [-0.0296],
        ...,
        [-0.0159],
        [-0.0154],
        [-0.0184]])
tensor([[-0.0006],
        [-0.0257],
        [-0.0219],
        ...,
        [-0.0129],
        [-0.0198],
        [-0.0200]])
tensor([[-0.0149],
        [-0.0167],
        [-0.0096],
        ...,
        [-0.0147],
        [-0.0206],
        [-0.0158]])
tensor([[-0.0141],
        [-0.0116],
        [-0.0165],
        ...,
        [-0.0183],
        [-0.002

KeyboardInterrupt: 

In [None]:
writer.close()

In [207]:
torch.save(policy1.state_dict(), 'PPO_actor1_stable.pth')
torch.save(policy2.state_dict(), 'PPO_actor2_stable.pth')
torch.save(critic.state_dict(), 'PPO_critic_stable.pth')