In [None]:
import collections
import os
from multiprocessing import Process

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from unityagents import UnityEnvironment

In [None]:
# Location of the Reacher executable. The original absolute path stays the default,
# but it can be overridden with the REACHER_ENV_PATH environment variable so the
# notebook runs on other machines without editing this cell.
REACHER_ENV_PATH = os.environ.get(
    "REACHER_ENV_PATH",
    "C:/Users/gabyc/Desktop/Reinforcment_TP/deep-reinforcement-learning/p2_continuous-control/Multi_agent/Reacher_Windows_x86_64/Reacher.exe",
)
env = UnityEnvironment(file_name=REACHER_ENV_PATH)

# Models

In [None]:
class Policy(nn.Module):
    """Gaussian policy network.

    A three-layer MLP maps a state to the mean of a Normal distribution over
    actions (squashed to [-1, 1] by tanh). The standard deviation is a fixed,
    non-trainable 0.5 for every action dimension.

    Args:
        input_size: size of the last dimension of the state tensor.
        nb_action: number of action dimensions.
    """

    def __init__(self,input_size,nb_action):
        super(Policy, self).__init__()
        self.nb_action = nb_action
        # Kept for backward compatibility; the module itself no longer relies on it.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.fc1 = nn.Linear(input_size,150)
        self.fc2 = nn.Linear(150,75)
        self.fc3 = nn.Linear(75,nb_action)
        # Registered as a buffer so it follows the module through .to()/.cuda()/.cpu()
        # (a plain tensor attribute stays on the device it was created on).
        # persistent=False keeps state_dict() identical to the previous layout.
        self.register_buffer("sigma", torch.full((nb_action,), 0.5), persistent=False)

    def forward(self, x):
        """Return the action distribution ``Normal(mu(x), sigma)`` for state(s) ``x``."""
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # tanh because action values lie between -1 and 1.
        # torch.tanh replaces the deprecated F.tanh.
        mu = torch.tanh(self.fc3(x))
        # Argument validation is switched off, as in the original call.
        return torch.distributions.normal.Normal(mu, self.sigma, validate_args=False)

In [None]:
class Critic(nn.Module):
    """State-value network: a three-layer MLP mapping a state to a scalar value.

    Args:
        input_size: size of the last dimension of the state tensor.
    """

    def __init__(self,input_size):
        super(Critic, self).__init__()

        self.fc1 = nn.Linear(input_size,150)
        self.fc2 = nn.Linear(150,50)
        self.fc3 = nn.Linear(50,1)

    def forward(self, x):
        """Return the value estimate for state(s) ``x``; the last dimension has size 1."""
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Linear output head: a ReLU here clamps values to >= 0 and, once the
        # pre-activation goes negative, outputs a constant 0 with zero gradient,
        # so the critic can stop learning altogether.
        return self.fc3(x)

# Evaluation

In [None]:
def collect_trajectories(env,env_info,policy,device,tmax):
    """Roll the policy out in the environment and record the trajectory.

    Starting from the observations in ``env_info``, actions are sampled from
    ``policy`` (in eval mode, without gradient tracking) and sent to ``env``
    until any agent reports ``done`` or the step counter reaches ``tmax``.
    The counter is checked after the step, so at most ``tmax + 1`` steps are
    recorded.

    Args:
        env: environment exposing ``brain_names`` and ``step(actions)``.
        env_info: info object whose ``vector_observations`` is the start state.
        policy: module returning a distribution with ``sample``/``log_prob``.
        device: torch device the state tensors are moved to.
        tmax: step-counter limit.

    Returns:
        ``(states, actions, rewards, log_probs)`` - four lists with one entry
        per step: state tensors, the sampled (unclipped) action tensors,
        rewards as numpy arrays, and the log-probabilities of the sampled
        actions under ``policy``.
    """
    brain_name = env.brain_names[0]
    state = env_info.vector_observations  # current state (for each agent)
    states_tab, action_tab, reward_tab, prob_tab = [], [], [], []

    # Mode switch and no_grad context are loop-invariant, so set them once.
    policy.eval()
    t = 0
    with torch.no_grad():
        while True:
            state = torch.from_numpy(np.asarray(state)).to(device)
            m = policy(state)

            # The stored action and its log-probability are the raw sample ...
            sample = m.sample()
            proba = m.log_prob(sample)

            # ... while the environment receives the sample clipped to [-1, 1].
            env_action = torch.clip(sample.detach().cpu(), -1, 1).numpy()

            env_info = env.step(env_action)[brain_name]   # send all actions to the environment
            next_states = env_info.vector_observations    # next state (for each agent)
            rewards = env_info.rewards                    # reward (for each agent)
            dones = env_info.local_done                   # episode-finished flags

            states_tab.append(state)
            action_tab.append(sample)
            reward_tab.append(np.asarray(rewards))
            prob_tab.append(proba)

            # Stop at the end of the episode or when the step limit is reached.
            if np.any(dones) or t >= tmax:
                break
            state = next_states
            t += 1
    return states_tab, action_tab, reward_tab, prob_tab

In [None]:
def TD_evaluation(Critic,states,reward,discount,device):
    """Compute one-step TD errors along a trajectory.

    For every transition ``t`` that has a successor state:

        delta_t = reward[t] + discount * V(states[t + 1]) - V(states[t])

    The critic is put in eval mode and evaluated without gradient tracking.

    Args:
        Critic: value network called on one state tensor at a time.
        states: sequence of per-step state tensors.
        reward: sequence of per-step numpy reward arrays, where ``reward[t]``
            belongs to the transition out of ``states[t]``.
        discount: discount factor.
        device: torch device the rewards are moved to.

    Returns:
        Tensor stacking the ``len(states) - 1`` TD errors along dimension 0.
        ``torch.stack`` raises if the trajectory has fewer than two states.
    """
    Delta_t = []
    Critic.eval()
    with torch.no_grad():
        Valuet = Critic(states[0])

        # reward[:-1] (not reward[1:]): the reward collected when leaving
        # states[t] must be paired with V(states[t + 1]) - V(states[t]).
        for rw, st in zip(reward[:-1], states[1:]):
            Valuetplus1 = Critic(st)
            TD_error = torch.from_numpy(rw).to(device).unsqueeze(1) + discount*Valuetplus1 - Valuet
            Delta_t.append(TD_error)

            Valuet = Valuetplus1

    return torch.stack(Delta_t)

# Training

In [None]:
def clipped_surrogate_critic(device,estimator_At,policy, old_probs,actions, states, rewards,batch_size
                      ,critic,discount = 0.995, epsilon=0.1, beta=0.01):
    """Run one pass of PPO-style clipped-surrogate updates plus critic regression.

    The trajectory is processed in consecutive chunks of ``batch_size`` time
    steps. For each chunk the policy is updated on the clipped surrogate
    objective (with an entropy bonus weighted by ``beta``), then the critic is
    fitted to the discounted reward-to-go through ``TD_Training``.

    Both updates are applied through the module-level ``optimizer``.

    Args:
        device: torch device, forwarded to ``TD_Training``.
        estimator_At: advantage estimates, one row per time step; only time
            steps covered by this tensor are used. Normalised across
            dimension 1 before use.
        policy: module returning a torch distribution for a batch of states.
        old_probs: list of per-step log-probabilities recorded at collection.
        actions: list of per-step action tensors recorded at collection.
        states: list of per-step state tensors.
        rewards: list of per-step numpy reward arrays.
        batch_size: number of time steps per chunk.
        critic: value network, forwarded to ``TD_Training``.
        discount: discount factor for the reward-to-go.
        epsilon: clipping range of the probability ratio.
        beta: weight of the entropy bonus.

    Returns:
        None.
    """
    # Stack the per-step lists once instead of once per chunk.
    old_probs = torch.stack(old_probs)
    states = torch.stack(states)
    actions = torch.stack(actions)

    # Discounted reward-to-go: G_t = r_t + discount * G_{t+1}.
    # (The previous recursion left the last step undiscounted while scaling
    # every other step by discount**t, so the critic targets were inconsistent.)
    rewards = np.asarray(rewards)
    reward_futur = np.zeros((rewards.shape[0], rewards.shape[1]))
    running_return = np.zeros(rewards.shape[1])
    for t in reversed(range(rewards.shape[0])):
        running_return = rewards[t] + discount * running_return
        reward_futur[t] = running_return

    # Normalise the advantages across dimension 1; keepdim lets them broadcast
    # over the action dimension instead of materialising repeated copies.
    estimator_At = estimator_At.detach().to(old_probs.dtype)
    mean = estimator_At.mean(dim=1, keepdim=True)
    std = estimator_At.std(dim=1, keepdim=True) + 1.0e-10
    estimator_At = (estimator_At - mean) / std

    # Consecutive chunks of time-step indices.
    indices = torch.split(torch.arange(estimator_At.shape[0]), batch_size, 0)

    for chunk in indices:
        states_chunk = states[chunk]
        actions_chunk = actions[chunk]
        old_prob_chunk = old_probs[chunk]
        estimator_At_chunk = estimator_At[chunk]
        reward_futur_chunk = reward_futur[chunk.numpy()]

        # Log-probability of the stored actions under the current policy
        # (gradients flow through the policy parameters).
        dist = policy(states_chunk)
        new_prob_chunk = dist.log_prob(actions_chunk)

        # Probability ratio from log-probabilities, then the clipped surrogate.
        Fraction = torch.exp(new_prob_chunk - old_prob_chunk)
        Cote1 = estimator_At_chunk * Fraction
        Cote2 = estimator_At_chunk * torch.clamp(Fraction, 1 - epsilon, 1 + epsilon)
        Gradient = torch.min(Cote1, Cote2)

        # Entropy of the current action distribution. The previous expression
        # applied a Bernoulli cross-entropy formula to Gaussian log-densities,
        # which is not an entropy and injected a spurious gradient.
        entropy = dist.entropy()

        L = - torch.mean(beta*(entropy) + Gradient)

        # set_to_none leaves parameters untouched by this loss without a
        # gradient, so an optimizer shared with the critic skips them.
        optimizer.zero_grad(set_to_none=True)
        L.backward()
        optimizer.step()
        del L

        TD_Training(critic,states_chunk,reward_futur_chunk,discount,device)
        

In [None]:
def New_prob(policy,states,actions,device):
    """Evaluate stored actions under the current policy.

    For each ``(state, action)`` pair the policy is called on the state and the
    log-probability of the stored action is computed. Gradients flow through
    the policy; no sampling or clipping happens here.

    Args:
        policy: module returning a distribution with ``log_prob``.
        states: iterable of per-step state tensors.
        actions: iterable of per-step action tensors, aligned with ``states``.
        device: unused; kept so existing call sites keep working.

    Returns:
        ``(log_probs, actions)`` - both stacked along a new leading dimension.
    """
    log_probs = []
    evaluated_actions = []

    # A single loop covers every step; the first step needs no special case.
    for state_iter, action_iter in zip(states, actions):
        m = policy(state_iter)
        log_probs.append(m.log_prob(action_iter))  # old action, new policy
        evaluated_actions.append(action_iter)

    return torch.stack(log_probs), torch.stack(evaluated_actions)

In [None]:
def TD_Training(Critic,states,reward,discount,device,critic_optimizer=None):
    """Take one gradient step fitting the critic to the supplied targets.

    Minimises ``0.5 * mean((discount * reward - Critic(states)) ** 2)``.

    Args:
        Critic: value network; called once on the whole ``states`` batch.
        states: tensor of states; detached before use.
        reward: numpy array of regression targets, one per state row/agent
            (a trailing singleton dimension is added to match the critic output).
        discount: factor applied to ``reward`` in the target.
        device: unused directly; targets are moved to wherever the critic's
            output lives.
        critic_optimizer: optimizer to step. When omitted, the module-level
            ``optimizer`` is used, as before.

    Returns:
        None.
    """
    opt = optimizer if critic_optimizer is None else critic_optimizer

    states = states.detach()
    # One batched forward pass instead of a Python loop over time steps.
    values = Critic(states)

    # Match the critic's dtype/device so a float64 numpy target does not
    # silently promote the whole loss to double precision.
    target = torch.from_numpy(reward).to(device=values.device, dtype=values.dtype).unsqueeze(2)

    # NOTE(review): the target is still scaled by `discount`, as in the original;
    # if `reward` is already a discounted return this scaling looks redundant - confirm.
    Loss = 0.5 * (discount * target - values).pow(2).mean()

    # set_to_none leaves parameters untouched by this loss without a gradient,
    # so an optimizer shared with the policy skips them.
    opt.zero_grad(set_to_none=True)
    Loss.backward()
    opt.step()

# Main

In [None]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations                  # current state (for each agent)
num_agents = len(states)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
nb_states = len(states[0])
action_size = brain.vector_action_space_size
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
policy = Policy(nb_states,action_size).to(device)
critic = Critic(nb_states).to(device)
# The training functions step the single module-level `optimizer` for both the
# policy loss and the critic loss. Creating one Adam per network under the same
# name left only the critic's optimizer bound, so the policy was never updated.
# One optimizer over both parameter sets lets each loss update its own network.
optimizer = optim.Adam(list(policy.parameters()) + list(critic.parameters()), lr=3e-4)

In [None]:
###################################################### MAIN_CODE #################################################
# training loop max iterations
episode = 500

# widget bar to display progress
#!pip install progressbar
#import progressbar as pb
#widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA() ]
#timer = pb.ProgressBar(widgets=widget, maxval=episode).start()


discount_rate = .99
epsilon = 0.1
beta = .01
tmax = 1200
SGD_epoch = 1
batch_size = 400

# keep track of progress
mean_rewards = []

for e in range(episode):

    # Start every episode from a fresh reset so the first observation passed to
    # the collector matches the environment's actual state (the env_info from
    # the setup cell goes stale after the first rollout).
    env_info = env.reset(train_mode=True)[brain_name]

    # collect trajectories
    # (collect_trajectories is the collector defined in this notebook; the
    # previous name, collect_trajectories_critic, is not defined anywhere.)
    states, actions, rewards,prob = collect_trajectories(env,env_info, policy,device,tmax)
    # Mean reward over all steps and agents of this rollout.
    total_rewards = np.mean(rewards)
    print(total_rewards)

    #Delta_t = Online_TD_evaluation(critic,states,rewards,discount_rate,device)
    Delta_t = TD_evaluation(critic,states,rewards,discount_rate,device)
    #print(Delta_t.shape)

    # gradient ascent step
    for _ in range(SGD_epoch):

        # Pass the same discount used for the TD errors; otherwise the update
        # silently falls back to its own default (0.995).
        clipped_surrogate_critic(device,Delta_t,policy, prob,actions, states, rewards,batch_size,critic,
                                 discount=discount_rate, epsilon=epsilon, beta=beta)
        #L.requires_grad_() # I needed to do that to compute something but maybe that means that there is a bug.

    # the clipping parameter reduces as time goes on
    epsilon*=.999

    # the regulation term also reduces
    # this reduces exploration in later runs
    beta*=.999

    # get the average reward of the parallel environments
    mean_rewards.append(np.mean(total_rewards))

    # display some progress every 20 iterations
    if (e+1)%20 ==0 :
        print("################################")
        print("Episode: {0:d}, score: {1:f}".format(e+1,np.mean(total_rewards)))
        print(total_rewards)

    # update progress widget bar
    #timer.update(e+1)

#timer.finish()