# Continuous Control

---

In this notebook, you will learn how to use the Unity ML-Agents environment for the second project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [1]:
######################## DIFFERENCE BETWEEN PONG AND OUR ENVIRONMENT ##############################
# num_agent : 20 -> 20 agent
# Size of each action : 4 -> One action is size(4) AND continuous.
# Difference Not (1) -> But
# In PONG : 1 last output probability of action 5 or 4.
#     # convert states to policy (or probability)
# new_probs = pong_utils.states_to_prob(policy, states)
# new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0-new_probs)
# the output is the probability of moving right
# P(left) = 1-P(right)
# outi​={xi​yi​​if conditioni​otherwise​}

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from unityagents import UnityEnvironment
import collections
from multiprocessing import Process
import torch.optim as optim

In [16]:
class Policy(nn.Module):

    def __init__(self,input_size,nb_action):
        super(Policy, self).__init__()
        
        self.fc1 = nn.Linear(input_size,300)
        self.fc2 = nn.Linear(300,300)
        self.fc3 = nn.Linear(300,nb_action)
        self.fc3bis = nn.Linear(300,nb_action)
        
        
    def forward(self, x):
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mu = F.tanh(self.fc3(x))
        sigma = F.tanh(self.fc3bis(x)) # ¨b donne des sigma avec 0 veut que des sigma positif
        sigma = torch.exp(sigma)
        m = torch.distributions.normal.Normal(mu,sigma,False) # False, whereas constraint on mu = 0
        sample = m.sample()#.detach()
        proba = m.log_prob(sample)
        probab = torch.exp(proba)
        # action
        action_sample = torch.clip(sample.detach(), -1, 1)
        
        
        # Every entry in the action vector must be a number between -1 and 1
        return probab,action_sample,mu

In [64]:
class Policy(nn.Module):

    def __init__(self,input_size,nb_action):
        super(Policy, self).__init__()
        
        self.fc1 = nn.Linear(input_size,300)
        self.fc2 = nn.Linear(300,300)
        self.fc3 = nn.Linear(300,nb_action)
        self.fc3bis = nn.Linear(300,nb_action)
        
        
    def forward(self, x):
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mu = F.tanh(self.fc3(x))
        sigma = F.tanh(self.fc3bis(x)) # ¨b donne des sigma avec 0 veut que des sigma positif
        sigma = torch.exp(sigma)
        m = torch.distributions.normal.Normal(mu,sigma,False) # False, whereas constraint on mu = 0
        # Every entry in the action vector must be a number between -1 and 1
        return mu,m

In [86]:
def New_prob(policy,states):
    Tab = []
    Action_sample_tab = []
    mu_tab = []
    #proba,action_sample,mu = policy.forward(states[0])
    mu,m = policy.forward(states[0])
    sample = m.sample()#.detach()
    proba = m.log_prob(sample)
    probab = torch.exp(proba)
    #action_sample = torch.clip(sample.detach(), -1, 1)
    
    # STORE
    Tab.append(probab)
    Action_sample_tab.append(sample)
    mu_tab.append(mu)
    #new_probs = torch.tensor(proba.unsqueeze(0))
    for state_iter in states[1:]:
        #proba,action_sample,mu = policy.forward(state_iter)
        mu,m = policy.forward(states[0])
        sample = m.sample()#.detach()
        proba = m.log_prob(sample)
        probab = torch.exp(proba)
        Tab.append(probab)
        Action_sample_tab.append(sample)
        mu_tab.append(mu)
        #pr()
        #proba = proba.detach().cpu().numpy() # Not Detach because needed !
        #new_probs = torch.cat((new_probs,proba.unsqueeze(0)))
    return torch.stack(Tab),torch.stack(Action_sample_tab),torch.stack(mu_tab)

In [87]:
def clipped_surrogate(policy, old_probs, states, rewards,
                      discount = 0.995, epsilon=0.1, beta=0.01):
    
    #actions = torch.tensor(actions, dtype=torch.int8, device=device)

    # convert states to policy (or probability)
    new_probs,action_sample,mu = New_prob(policy, states)
    
    #new_probs = np.asarray(new_probs)
    #new_probs = torch.from_numpy(new_probs).to(device)
    #print(new_probs)
    #print(new_probs.shape) # [Nb_iteration,Nb_agent,Values entre -1 et 1]
    
    old_probs = np.asarray(old_probs)
    old_probs = torch.from_numpy(old_probs).to(device)
    #print(old_probs)
    Fraction = torch.div(new_probs,old_probs+1e-10) # Changed
    #print(Fraction.shape)
    
    # Convert REWARD TO REWARD FUTURE
    rewards = np.asarray(rewards)
    reward_futur = np.zeros((rewards.shape[0],rewards.shape[1]))
    longueur = rewards.shape[0] - 1
    reward_futur[longueur] = rewards[longueur]
    new_discount = 0
    for i in range(1,rewards.shape[0]):
        new_discount = discount**(longueur-i) 
        reward_futur[longueur-i] = reward_futur[longueur-(i-1)] + rewards[longueur-i]*new_discount
        
    # Compute normal reward
    mean = np.mean(reward_futur, axis=1)
    std = np.std(reward_futur, axis=1)+1.0e-10
    normalized_rewards = (reward_futur-mean[:, np.newaxis])/std[:, np.newaxis]
    normalized_rewards = torch.from_numpy(normalized_rewards).float().to(device)
    normalized_rewards = normalized_rewards.unsqueeze(2)
    normalized_rewards = normalized_rewards.repeat(1, 1, Fraction.shape[2])

    # Compute each 
    #print(normalized_rewards.shape)
    Cote1 = normalized_rewards*Fraction
    #print(Cote1)
    Cote2 = normalized_rewards*torch.clamp(Fraction, 1-epsilon, 1+epsilon)
    #print(Cote1)
    Cote1 = Cote1[:, :,:, None]
    Cote2 = Cote2[:, :,:, None]
    #print(Cote2.shape)
    comp = torch.cat((Cote1, Cote2),3)
    #print(comp.shape)
    Gradient = torch.min(comp,3)[0].to(device)
    #print(Gradient.shape)

    # include a regularization term
    # this steers new_policy towards 0.5
    # prevents policy to become exactly 0 or 1 helps exploration
    # add in 1.e-10 to avoid log(0) which gives nan
    entropy = -(new_probs*torch.log(old_probs+1.e-10)+ \
        (1.0-new_probs)*torch.log(1.0-old_probs+1.e-10))
    #print(entropy.shape)
    #print(entropy)
    #L= torch.mean(beta*(entropy) +Gradient)
    
    return torch.mean(beta*(entropy) +Gradient) #*(action_sample-mu)) # Add Action_sample - mu for the Normal derivation

In [40]:
L = clipped_surrogate(policy, prob, states, rewards, epsilon=epsilon, beta=beta)

In [12]:
env = UnityEnvironment(file_name="C:/Users/gabyc/Desktop/Reinforcment_TP/deep-reinforcement-learning/p2_continuous-control/Multi_agent/Reacher_Windows_x86_64/Reacher.exe")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [88]:
def collect_trajectories(env,env_info,policy,device):
    # DEAL WITH THAT OLD_PROB AND ACTION ARE DIFFERENT NOW.
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    state = env_info.vector_observations # get the current state (for each agent)
    states_tab , action_tab, reward_tab, prob_tab = [],[],[], []
    while True:
        state = torch.from_numpy(state).to(device)
        policy.eval()
        with torch.no_grad():
            #proba,action_sample,mu = policy(state) # Batch of 21
            mu,m = policy(state) 

        # Treatment
        #proba = proba.detach().cpu().numpy()
        #action = mu.detach().cpu().numpy()
        action = torch.clip(mu.detach().cpu(), -1, 1)
        action = action.numpy()
        sample = m.sample()       
        proba = m.log_prob(sample)
        proba = torch.exp(proba).detach().cpu().numpy()
        
        # Step the environment
        env_info = env.step(action)[brain_name]           # send all actions to the environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        
        # Store values
        action_tab.append(action)
        prob_tab.append(proba)
        reward_tab.append(np.asarray(rewards))
        states_tab.append(state)
        
        # BREAK IF END OF THE EPISODE
        if np.any(dones):                                  # exit loop if episode finished
            break
        state = next_states
    return states_tab, action_tab, reward_tab,prob_tab

# Launch Main code

In [89]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]  
states = env_info.vector_observations # get the current state (for each agent
num_agents = len(states)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
nb_states = len(states[0])
action_size = brain.vector_action_space_size
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
policy = Policy(nb_states,action_size).to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-4)

In [None]:
###################################################### MAIN_CODE #################################################
# training loop max iterations
episode = 500

# widget bar to display progress
#!pip install progressbar
#import progressbar as pb
#widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA() ]
#timer = pb.ProgressBar(widgets=widget, maxval=episode).start()


discount_rate = .99
epsilon = 0.1
beta = .01
tmax = 320
SGD_epoch = 6

# keep track of progress
mean_rewards = []

for e in range(episode):

    # collect trajectories
    states, actions, rewards,prob = collect_trajectories(env,env_info, policy,device)
    total_rewards = np.mean(rewards)
    print(total_rewards)

    # gradient ascent step
    for _ in range(SGD_epoch):
        
        # uncomment to utilize your own clipped function!
        L = -clipped_surrogate(policy,prob, states, rewards, epsilon=epsilon, beta=beta)
        #L.requires_grad_() # I needed to do that to compute something but maybe that means that there is a bug.
        optimizer.zero_grad()
        L.backward()
        optimizer.step()
        del L
    
    # the clipping parameter reduces as time goes on
    epsilon*=.999
    
    # the regulation term also reduces
    # this reduces exploration in later runs
    beta*=.995
    
    # get the average reward of the parallel environments
    mean_rewards.append(np.mean(total_rewards))
    
    # display some progress every 20 iterations
    if (e+1)%20 ==0 :
        print("################################")
        print("Episode: {0:d}, score: {1:f}".format(e+1,np.mean(total_rewards)))
        print(total_rewards)
        
    # update progress widget bar
    #timer.update(e+1)
    
#timer.finish()

0.0007842157666871955
0.000879620359959332
0.0004335664238754686
0.00019180818752094464
0.00011088910841054611
6.193806055363837e-05
8.19180800870701e-05
6.443556299531733e-05
2.297702246344649e-05
0.00011888111622391881
4.44555434618856e-05
2.7972027346804426e-05
5.044954932191512e-05
2.297702246344649e-05
1.1488511231723246e-05
6.193806055363837e-05
2.4475523928453873e-05
5.894105762362361e-05
8.741258545876383e-05
0.00011438561182889667
################################
Episode: 20, score: 0.000114
0.00011438561182889667
6.693306543699631e-05
6.143856006530258e-05
0.00013086912794397785
0.0002002996958226531
0.00048151847075570477
0.0006073925938163247
0.0005369630249609778
0.0004320679224104612
0.0007682317510604501
0.0005389610269143209
0.0006353646211631291
0.0006188811050480479
0.0003726273642985018
0.0003956043867619483
0.0005564435440060738
0.0008121877940339999
0.0009725274507897896
0.00095704293565138
0.0009730269512781254
0.0008946053746094058
###############################

In [230]:
env.close()

In [229]:
import matplotlib
#matplotlib.use('GTK3Agg')

# Tensorboard

In [213]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

TypeError: __new__() got an unexpected keyword argument 'serialized_options'

In [225]:
#import matplotlib
#matplotlib.use("Agg")
import hiddenlayer as hl

In [221]:
.shape

torch.Size([20, 33])

In [226]:
hl.build_graph(policy,states[0].detach().to(device))

TypeError: _jit_pass_onnx_graph_shape_type_inference(): incompatible function arguments. The following argument types are supported:
    1. (arg0: torch::jit::Graph, arg1: Dict[str, IValue], arg2: int) -> None

Invoked with: graph(%0 : Double(20, 33, strides=[33, 1], requires_grad=0, device=cuda:0),
      %1 : Float(300, 33, strides=[33, 1], requires_grad=1, device=cuda:0),
      %2 : Float(300, strides=[1], requires_grad=1, device=cuda:0),
      %3 : Float(300, 300, strides=[300, 1], requires_grad=1, device=cuda:0),
      %4 : Float(300, strides=[1], requires_grad=1, device=cuda:0),
      %5 : Float(4, 300, strides=[300, 1], requires_grad=1, device=cuda:0),
      %6 : Float(4, strides=[1], requires_grad=1, device=cuda:0),
      %7 : Float(4, 300, strides=[300, 1], requires_grad=1, device=cuda:0),
      %8 : Float(4, strides=[1], requires_grad=1, device=cuda:0)):
  %9 : Float(20, 33, strides=[33, 1], requires_grad=0, device=cuda:0) = onnx::Cast[to=1](%0) # <ipython-input-182-0488979f2f3b>:13:0
  %10 : Float(20, 300, strides=[300, 1], requires_grad=1, device=cuda:0) = onnx::Gemm[alpha=1., beta=1., transB=1](%9, %1, %2) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\nn\functional.py:1848:0
  %11 : Float(20, 300, strides=[300, 1], requires_grad=1, device=cuda:0) = onnx::Relu(%10) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\nn\functional.py:1299:0
  %12 : Float(20, 300, strides=[300, 1], requires_grad=1, device=cuda:0) = onnx::Gemm[alpha=1., beta=1., transB=1](%11, %3, %4) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\nn\functional.py:1848:0
  %13 : Float(20, 300, strides=[300, 1], requires_grad=1, device=cuda:0) = onnx::Relu(%12) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\nn\functional.py:1299:0
  %14 : Float(20, 4, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Gemm[alpha=1., beta=1., transB=1](%13, %5, %6) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\nn\functional.py:1848:0
  %15 : Float(20, 4, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Tanh(%14) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\nn\functional.py:1796:0
  %16 : Float(20, 4, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Gemm[alpha=1., beta=1., transB=1](%13, %7, %8) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\nn\functional.py:1848:0
  %17 : Float(20, 4, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Tanh(%16) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\nn\functional.py:1796:0
  %18 : Float(20, 4, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Exp(%17) # <ipython-input-182-0488979f2f3b>:18:0
  %19 : Long(2, strides=[1], device=cpu) = onnx::Shape(%15)
  %20 : Float(20, 4, device=cpu) = onnx::ConstantOfShape[value={0}](%19)
  %21 : Float(20, 4, strides=[4, 1], device=cpu) = onnx::Add(%20, %15)
  %22 : Float(20, 4, strides=[4, 1], device=cpu) = onnx::Add(%21, %18)
  %23 : Long(2, strides=[1], device=cpu) = onnx::Shape(%22)
  %24 : Float(20, 4, device=cpu) = onnx::Expand(%15, %23)
  %25 : Long(2, strides=[1], device=cpu) = onnx::Shape(%22)
  %26 : Float(20, 4, device=cpu) = onnx::Expand(%18, %25)
  %27 : Long(requires_grad=0, device=cpu) = onnx::Constant[value={20}]() # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\distributions\normal.py:49:0
  %28 : Long(requires_grad=0, device=cpu) = onnx::Constant[value={4}]() # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\distributions\normal.py:49:0
  %29 : Long(1, strides=[1], device=cpu) = onnx::Unsqueeze[axes=[0]](%27)
  %30 : Long(1, strides=[1], device=cpu) = onnx::Unsqueeze[axes=[0]](%28)
  %31 : Long(2, strides=[1], device=cpu) = onnx::Concat[axis=0](%29, %30)
  %32 : Long(1, strides=[1], device=cpu) = onnx::Constant[value={-1}]()
  %33 : Long(2, strides=[1], device=cpu) = onnx::Reshape(%31, %32)
  %34 : Long(1, strides=[1], device=cpu) = onnx::Shape(%33)
  %35 : Long(2, device=cpu) = onnx::ConstantOfShape[value={1}](%34)
  %36 : Long(requires_grad=0, device=cpu) = onnx::Constant[value={-1}]()
  %37 : Long(2, strides=[1], device=cpu) = onnx::Mul(%35, %36)
  %38 : Bool(2, strides=[1], device=cpu) = onnx::Equal(%33, %37)
  %39 : Long(2, strides=[1], device=cpu) = onnx::Where(%38, %35, %33)
  %40 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Expand(%24, %39) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\distributions\normal.py:64:0
  %41 : Long(1, strides=[1], device=cpu) = onnx::Unsqueeze[axes=[0]](%27)
  %42 : Long(1, strides=[1], device=cpu) = onnx::Unsqueeze[axes=[0]](%28)
  %43 : Long(2, strides=[1], device=cpu) = onnx::Concat[axis=0](%41, %42)
  %44 : Long(1, strides=[1], device=cpu) = onnx::Constant[value={-1}]()
  %45 : Long(2, strides=[1], device=cpu) = onnx::Reshape(%43, %44)
  %46 : Long(1, strides=[1], device=cpu) = onnx::Shape(%45)
  %47 : Long(2, device=cpu) = onnx::ConstantOfShape[value={1}](%46)
  %48 : Long(requires_grad=0, device=cpu) = onnx::Constant[value={-1}]()
  %49 : Long(2, strides=[1], device=cpu) = onnx::Mul(%47, %48)
  %50 : Bool(2, strides=[1], device=cpu) = onnx::Equal(%45, %49)
  %51 : Long(2, strides=[1], device=cpu) = onnx::Where(%50, %47, %45)
  %52 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Expand(%26, %51) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\distributions\normal.py:64:0
  %53 : Float(*, *, device=cpu) = onnx::RandomNormalLike(%40)
  %54 : Float(*, *, device=cpu) = onnx::Mul(%52, %53)
  %55 : Float(*, *, strides=[4, 1], requires_grad=0, device=cuda:0) = onnx::Add(%54, %40) # <ipython-input-182-0488979f2f3b>:24:0
  %56 : Long(requires_grad=0, device=cpu) = onnx::Constant[value={2}]()
  %57 : Float(device=cpu) = onnx::Cast[to=1](%56)
  %58 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Pow(%26, %57) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\_tensor.py:30:0
  %59 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Log(%26) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\distributions\normal.py:76:0
  %60 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Sub(%55, %24) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\distributions\normal.py:77:0
  %61 : Long(requires_grad=0, device=cpu) = onnx::Constant[value={2}]()
  %62 : Float(device=cpu) = onnx::Cast[to=1](%61)
  %63 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Pow(%60, %62) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\_tensor.py:30:0
  %64 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Neg(%63) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\distributions\normal.py:77:0
  %65 : Float(requires_grad=0, device=cpu) = onnx::Constant[value={2}]()
  %66 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%58, %65)
  %67 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Div(%64, %66) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\distributions\normal.py:77:0
  %68 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Sub(%67, %59) # C:\Users\gabyc\anaconda3\envs\Navigation2\lib\site-packages\torch\distributions\normal.py:77:0
  %69 : Float(requires_grad=0, device=cpu) = onnx::Constant[value={0.918939}]()
  %70 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Sub(%68, %69)
  %71 : Float(*, *, strides=[4, 1], requires_grad=1, device=cuda:0) = onnx::Exp(%70) # <ipython-input-182-0488979f2f3b>:22:0
  %72 : Long(requires_grad=0, device=cpu) = onnx::Constant[value={-1}]()
  %73 : Long(requires_grad=0, device=cpu) = onnx::Constant[value={1}]()
  %74 : Float(device=cpu) = onnx::Cast[to=1](%72)
  %75 : Float(device=cpu) = onnx::Cast[to=1](%73)
  %76 : Float(*, *, strides=[4, 1], requires_grad=0, device=cuda:0) = onnx::Clip(%55, %74, %75) # <ipython-input-182-0488979f2f3b>:24:0
  return (%71, %76)
, None, 11

# USELESS AFTER

In [56]:
print(L)

tensor(0.8390, device='cuda:0')


0.0005159840044508744

In [37]:
A =torch.tensor(1.00000e-03 *-6.9314, device='cuda:0')

In [39]:
from torchsummary import summary

ModuleNotFoundError: No module named 'torchsummary'

In [None]:
policy = Policy(nb_states,action_size).to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-4)

In [58]:
for a in policy.parameters():
    print(a)

Parameter containing:
tensor([[ 0.0101, -0.0718,  0.0199,  ..., -0.0826, -0.1599, -0.0077],
        [ 0.1577,  0.0900, -0.0746,  ..., -0.0418, -0.1527,  0.0953],
        [ 0.0462,  0.0263,  0.1640,  ...,  0.1208,  0.0626, -0.0682],
        ...,
        [-0.0324,  0.1004, -0.1033,  ..., -0.0490, -0.1629,  0.1655],
        [ 0.0649, -0.0408,  0.0351,  ...,  0.1219, -0.1655, -0.0715],
        [-0.1006,  0.1355, -0.1460,  ..., -0.0651, -0.1526,  0.0762]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([-0.1167,  0.0005, -0.0731,  0.1386,  0.0610,  0.0647, -0.1133, -0.0944,
        -0.0147,  0.1589, -0.0924, -0.0397,  0.1038,  0.0868, -0.1729, -0.0881,
         0.0176,  0.0112,  0.0870,  0.0435, -0.1395,  0.0897, -0.0384, -0.0323,
        -0.0146,  0.0589, -0.0394,  0.0956, -0.0070, -0.1229,  0.1619,  0.0851,
         0.0453, -0.1727, -0.1672, -0.0569,  0.1267, -0.1147, -0.1485, -0.1724,
        -0.1143,  0.0903, -0.0558, -0.0407, -0.0624, -0.1169,  0.1055,  0.0206

In [214]:
from torchviz import make_dot, make_dot_from_trace

In [74]:
states[0]

array([ 0.00000000e+00, -4.00000000e+00,  0.00000000e+00,  1.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -4.37113883e-08,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -1.00000000e+01,  0.00000000e+00,
        1.00000000e+00, -0.00000000e+00, -0.00000000e+00, -4.37113883e-08,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  8.36227417e-01, -1.00000000e+00,
        7.95617676e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
        7.76270628e-01])

In [77]:
torch.onnx.export(policy, torch.from_numpy(states[0]).to(device), 'rnn.onnx',opset_version=11)

In [215]:
make_dot(policy(states[0][0]), params=dict(policy.named_parameters()), show_attrs=True, show_saved=True)

ExecutableNotFound: failed to execute 'dot', make sure the Graphviz executables are on your systems' PATH

<graphviz.graphs.Digraph at 0x25f61164860>

In [47]:
L

tensor([-0.6981, -0.6660, -0.9851, -0.5342], device='cuda:0',
       requires_grad=True)

In [None]:
########### FAIRE REINFORCE EN ATTENDANT CAR C'EST PLUS SIMPLE ##############
#probs = policy_network(state)
# Note that this is equivalent to what used to be called multinomial
#m = Categorical(probs)
#action = m.sample()
#next_state, reward = env.step(action)
#loss = -m.log_prob(action) * reward
#loss.backward()
#############################################################################

# Reinforce with twenty trajectory

Model

In [78]:
class Policy_Reinforce(nn.Module):

    def __init__(self,input_size,nb_action):
        super(Policy, self).__init__()
        
        self.fc1 = nn.Linear(input_size,300)
        self.fc2 = nn.Linear(300,300)
        self.fc3 = nn.Linear(300,nb_action)
        self.fc3bis = nn.Linear(300,nb_action)
        
        
    def forward(self, x):
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mu = F.tanh(self.fc3(x))
        #print(mu)
        sigma = F.tanh(self.fc3bis(x)) # ¨b donne des sigma avec 0 veut que des sigma positif
        sigma = torch.exp(sigma)
        return sigma,mu

In [None]:
        #print(sigma)
        m = torch.distributions.normal.Normal(mu,sigma,False)
        #m = torch.distributions.multivariate_normal.MultivariateNormal(mu,sigma)
        sample = m.sample()#.detach()
        proba = m.log_prob(sample)
        #print("Proba")
        #print(proba)
        probab = torch.exp(proba)
        #print("Probab")
        # action
        action = torch.clip(sample.detach(), -1, 1)
        # Genertes a sample_shape shaped sample or sample_shape shaped batch of samples if the distribution parameters are batched.
        
        # Every entry in the action vector must be a number between -1 and 1

Init

In [None]:
###################################################### MAIN_CODE #################################################
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]  
states = env_info.vector_observations # get the current state (for each agent
num_agents = len(states)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
nb_states = len(states[0])
action_size = brain.vector_action_space_size
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
policy = Policy(nb_states,action_size).to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-4)

MAIN

In [None]:
# training loop max iterations
episode = 500

# widget bar to display progress
#!pip install progressbar
#import progressbar as pb
#widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA() ]
#timer = pb.ProgressBar(widgets=widget, maxval=episode).start()


discount_rate = .99
epsilon = 0.1
beta = .01
tmax = 320
SGD_epoch = 4

# keep track of progress
mean_rewards = []

for e in range(episode):

    # collect trajectories
    states, actions, rewards, prob = collect_trajectories(env,env_info, Policy_Reinforce,device)
    total_rewards = np.sum(rewards, axis=0)
    print(total_rewards)

    

    # gradient ascent step
    #for _ in range(SGD_epoch):
        
        # uncomment to utilize your own clipped function!
        #L = -clipped_surrogate(policy,prob, states, rewards, epsilon=epsilon, beta=beta)
    L = 
    optimizer.zero_grad()
    L.backward()
    optimizer.step()
        #del L
    
    # the clipping parameter reduces as time goes on
    epsilon*=.999
    
    # the regulation term also reduces
    # this reduces exploration in later runs
    beta*=.995
    
    # get the average reward of the parallel environments
    mean_rewards.append(np.mean(total_rewards))
    
    # display some progress every 20 iterations
    if (e+1)%20 ==0 :
        print("################################")
        print("Episode: {0:d}, score: {1:f}".format(e+1,np.mean(total_rewards)))
        print(total_rewards)

In [43]:
A = L
A.requires_grad = True

In [45]:
R = torch.mean(L)
print(R)

tensor(-0.7209, device='cuda:0', grad_fn=<MeanBackward0>)


In [46]:
R.backward()


In [101]:
env.close()

In [None]:
# NAA -----> FOR PPO CONTINUOUS ACTION do Actor-Critic algo <-----

In [None]:
-> On a en sortie [1001,20,4] <- Les valeurs des 4 actions. (Old_probs,New_probs) (pi(a,s),piprime(a,s))
-> Fraction = Old_probs/New_probs (Attention les produits valeurs -1 et 1 problem potentiel) -> Rendre positif avant
-> Clipping -> 

-> Reward * Fraction ([1001,20,4]*[1001,20,4])

