In [25]:
import gym
import torch
from a3_gym_env.envs.pendulum import CustomPendulumEnv
import numpy as np

from torch import optim

from torch.distributions import Normal

from torch.optim import Adam
from torch.distributions.multivariate_normal import MultivariateNormal
from Modules import PolicyNetwork, ExperienceReplayBuffer, CriticNetwork

# [DONE] Task 1: Start by implementing an environment interaction loop. You may refer to homework 1 for inspiration.
# [ ] Task 2: Create and test an experience replay buffer with a random policy, which is the Gaussian distribution with arbitrary (randomly initialized) weights of the policy feed-forward network,receiving state, s, and returning the mean, mu(s) and the log_std, log_stg(s) (natural logarithm of the standard deviation) of actions.  As mentioned above, you can use a state-independent standard variance.
# [ ] Task 3: Make an episode reward processing function to turn one-step rewards into discounted rewards-to-go: R(s_1) = sum_{t=1} gamma^{t-1} r_t, which is the discounted reward, starting from the state, s_1.
# [ ] Task 4: Start the model by implementing a vanilla policy gradient agent, where the gradient ascent stepsare done with the average of the gradient of log-likelihood over a trajectory weight by rewards-to-go   from each state. Try different step sizes in the gradient ascent.
# [ ] Task 5: Pendulum is a continuous action space environment. Check out the example in `Modules.py` for torch implementation of the Gaussian module.  (if you work in Julia, speak with me regarding the pendulum dynamics in Julia, and Flux for DNNs.)
# [ ] Task 6: Add a feed-forward network for the critic, accepting the state, s=[sin(angle), cos(angle), angular velocity], and returning a scalar for the value of the state, s.
# [ ] Task 7: Implement the generalized advantage, see Eq11-12 in the PPO paper, to be used instead of rewards-to-go.
# [ ] Task 8: Implement the surrogate objective for the policy gradient, see Eq7, without and without clipping.
# [ ] Task 9: Implement the total loss, see Eq9 in the PPO.
# [ ] Task 10: Combine all together to Algorithm 1 in the PPO paper. (In your basic implementation, you can collect data with a single actor, N=1)
# [ ] Task 11: You should see progress with default hyperparameters, but you can try tuning those to see how it will improve your results.

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("mps")


def interaction_loop():
    env = gym.make("Pendulum-v1-custom")
    # sample hyperparameters
    batch_size = 1000
    epochs = 30
    learning_rate = 1e-2
    #hidden_size = 8
    #n_layers = 2

    # optimizer = optim.Adam(vae.parameters(), lr=learning_rate)

    max_step = 1000
    obs = env.reset()
    for _ in range(max_step):
        # get a random action in this environment
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        # render already plots the graph for you, no need to use plt
        img = env.render()
        if done:
            obs = env.reset()
    env.close()

    
# task 2 * Create and test an experience replay buffer with a random policy, which is the 
#Gaussian distribution with arbitrary (randomly initialized) weights of the policy feed-forward network,
#receiving state, s, and returning the mean, mu(s) and the log_std, log_stg(s) 
#(natural logarithm of the standard deviation) of actions.  As mentioned above, you can use 
#a state-independent standard variance.
def test_experience_relay_buffer():
    env = gym.make("Pendulum-v1-custom")
    # sample hyperparameters
    batch_size = 1000
    epochs = 30
    learning_rate = 0.01
    hidden_size = 8
    n_layers = 2
    
    state = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    max_step = 1000
    
    policy = PolicyNetwork(env.observation_space.shape[0], env.action_space.shape[0]).to(device)
    
    memory = ExperienceReplayBuffer(batch_size)

    for _ in range(max_step):
        # get a random action in this environment
        action = env.action_space.sample()

        obs, reward, done, info = env.step(action)
        next_state = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
        memory.push(state, action, next_state, reward)

        gaus_param = policy(state)
        
        # print(gaus_param)
        state = next_state
        # render already plots the graph for you, no need to use plt
        img = env.render()
        if done:
            obs = env.reset()
    
    env.close()


# Sample an action from Normal distribution
def choose_action(policy, state):  
    
    state = torch.from_numpy(state).float().unsqueeze(0)
    act_prob, v = policy(state)
    v = torch.exp(v)
    pd = Normal(act_prob, torch.sqrt(v)) # get normal distirbution
    action = pd.sample() # sample from it
    log_prob = pd.log_prob(action)
    return action, log_prob

# task 3 * Make an episode reward processing function to turn one-step rewards into discounted rewards-to-go:
#R(s_1) = sum_{t=1} gamma^{t-1} r_t, which is the discounted reward, starting from the state, s_1.   
def dis_r_to_go(r_batch, gamma):
    r_togo = np.zeros(len(r_batch))
    R=0
    for i in reversed(range(len(r_batch))):
        R = r_batch[i] + gamma*R
        r_togo[i] = R
            
    return r_togo
 
    
# task 4 Start the model by implementing a vanilla policy gradient agent, where the gradient ascent steps
#are done with the average of the gradient of log-likelihood over a trajectory weight by rewards-to-go   
#from each state. Try different step sizes in the gradient ascent.
def run_trajectory(env_name, num_trajectory, gamma, learning_rate):
    env = gym.make(env_name)
    batch_size=500
    input_size=env.observation_space.shape[0]
    output_size=env.action_space.shape[0]
    hidden=32
    policy = PolicyNetwork(input_size, output_size, hidden) #init policy function
    optimizer = Adam(policy.parameters(), lr=learning_rate)
     
    obs_batch = [] # stores states
    traj_r_batch = [] # all rewards in a trajectory
    act_batch = [] # action batch
    r_tg_batch=[] # reward to go batch
    log_batch=[] # log prob batch
    loss_batch=[] # loss batch
    
    count = 0
    

    # number of trajectories
    for eps in range(num_trajectory):
        obs=env.reset() # restart
        done=False
        traj_r = []
        #state=obs
        
        while (count<batch_size): # time steps
            action, log_prob = choose_action(policy, obs)
            act_batch.append(action)
            log_batch.append(log_prob)
            obs_batch.append(obs)
            
            obs, reward, done, info = env.step(action)
            traj_r.append(reward)
            count+=1
            
        traj_r = np.array(traj_r)
        traj_r_tg = np.zeros_like(traj_r)
        traj_r_tg = dis_r_to_go(traj_r, 0.95) # r to go
                
        r_tg_batch.append(traj_r_tg) 
        
        
    obs_batch = torch.tensor(obs_batch, dtype=torch.float, requires_grad=True)
    act_batch = torch.tensor(act_batch, dtype=torch.float,requires_grad=True)
    r_tg_batch = torch.tensor(r_tg_batch, dtype=torch.float,requires_grad=True)
    log_batch = torch.tensor(log_batch, dtype=torch.float,requires_grad=True)
    
    
    loss = -(log_batch * r_tg_batch).mean() # gradient ascent
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
   

    print(loss.item())   
    return obs_batch, act_batch, r_tg_batch, log_batch, loss.item()
     

'''Task 5: * Pendulum is a continuous action space environment. 
Check out the example in `Modules.py` for torch implementation of the Gaussian module.  
(if you work in Julia, speak with me regarding the pendulum dynamics in Julia, and Flux for DNNs.)'''
    

'''Task 6: Add a feed-forward network for the critic, accepting the state, s=[sin(angle), cos(angle), angular velocity], 
and returning a scalar for the value of the state, s.'''
def CriticNet(state):
    #input_size = state.shape
    input_size=3
    output_size = 1
    critic = CriticNetwork(input_size, output_size)
    scalar = critic(torch.FloatTensor(state))
    #scalar = critic(state)
    return scalar.item()

# calculate values of state of actor network
def calc_value(state_batch):
    l = state_batch.size(dim=0)
    value_batch = np.zeros(l)
    my_scalar = 0
    for i in range(l):
        my_scalar = CriticNet(state_batch[i])
        value_batch[i] = my_scalar
        #print(value_batch[i])
    value_batch = torch.tensor(value_batch, dtype=torch.float)
    return value_batch
    
    
'''Task 7 Implement the generalized advantage, see Eq11-12 in the PPO paper, to be used instead of rewards-to-go.'''
# advantage estimator
def generalized_advantage(r_batch, value, gamma=0.99, lamd=0.95):
    gae = 0
    b_size = r_batch.size(dim=1)
    adv = np.zeros(b_size) # batch for generalized advantage
    next_value = 0

    for t in reversed(range(b_size)): # 
        b_error = r_batch[0][t] + gamma * next_value - value[t]
        gae = b_error + gae * gamma * lamd
        next_value = value[t]
        adv[t] = gae
        
        #print(gae)
    adv = torch.tensor(adv, dtype=torch.float)

    return adv



'''Task 8  Implement the surrogate objective for the policy gradient, see Eq7, with and without clipping. '''
def surrogate_obj(log_batch, A_head, epsilon):
    
    cpi = np.zeros(log_batch.size(dim=0))
    sur_obj = np.zeros(log_batch.size(dim=0))
    for i in reversed(range(log_batch.size(dim=0))):
        if (i>0):
            cur_policy = log_batch[i]
            ratio = cur_policy/log_batch[i-1]
            cpi[i] = ratio * A_head[i] # conservative policy iteration
            clip_ratio = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
            sur_obj[i] = torch.min(cpi[i], (clip_ratio*A_head[i]))
            #print(type(cpi[i]))
            #print(type(cpi))
    
    cpi = torch.tensor(cpi, dtype=torch.float)
    sur_obj = torch.tensor(sur_obj, dtype=torch.float)
    return sur_obj, cpi


'''Task 9 Implement the total loss, see Eq9 in the PPO. '''
def PPO_eq9():
    
     
    
    return 0


'''Task 10 Combine all together to Algorithm 1 in the PPO paper. 
(In your basic implementation, you can collect data with a single actor, N=1)'''
def algorithm_1():
    #for iteration in range():
     #   for actor in range():
            
    
    return 0


if __name__ == "__main__":
    mybatch=[1,3,5,7,9]
    #interaction_loop()
    #test_experience_relay_buffer()
    
    
    name = 'Pendulum-v1-custom'
    
    s_batch, a_batch, togo_batch, log_batch, loss = run_trajectory(name, 1, 0.95, 0.01)
    
    #criticNet(s_batch)
    
    
    
    #res=dis_r_to_go(mybatch, 0.9)
    
    
    
    

-137.69229125976562


  traj_r = np.array(traj_r)
  traj_r = np.array(traj_r)


In [26]:
s_batch

tensor([[ 0.7289, -0.6846,  0.3521],
        [ 0.7197, -0.6942, -0.2669],
        [ 0.7021, -0.7121, -0.5023],
        ...,
        [ 0.2781,  0.9606,  6.2948],
        [-0.0754,  0.9972,  7.1449],
        [-0.4577,  0.8891,  8.0000]], requires_grad=True)

In [27]:
loss

-137.69229125976562

In [28]:
value=calc_value(s_batch)

In [29]:
type(value)

torch.Tensor

In [21]:
#value

In [30]:
c=generalized_advantage(togo_batch, value, gamma=0.99, lamd=0.95)

In [31]:
c

tensor([-1587.7316, -1595.4407, -1598.9510, -1598.4623, -1593.3239, -1583.2383,
        -1568.0068, -1547.2717, -1521.4280, -1491.1184, -1457.3116, -1420.6160,
        -1383.7124, -1348.9764, -1319.8510, -1298.3645, -1284.8042, -1277.1028,
        -1273.0343, -1272.2870, -1272.3955, -1272.9189, -1271.9141, -1268.9266,
        -1263.2604, -1253.9785, -1240.8010, -1223.3254, -1201.3940, -1175.1576,
        -1144.8004, -1111.2556, -1075.9952, -1041.3121, -1010.0479,  -986.0060,
         -971.9357,  -967.9603,  -972.6788,  -984.5463, -1002.2635, -1024.4709,
        -1050.3668, -1079.0642, -1109.5378, -1141.7629, -1174.9819, -1208.8773,
        -1243.8253, -1279.1874, -1314.6576, -1350.5012, -1386.3195, -1421.8701,
        -1457.3545, -1492.1636, -1525.9115, -1558.9495, -1590.5607, -1620.8546,
        -1649.3738, -1676.0085, -1700.2755, -1721.5626, -1739.9891, -1755.0481,
        -1766.3724, -1773.6063, -1776.4425, -1774.4092, -1767.5999, -1755.3102,
        -1738.5145, -1718.0291, -1693.61

In [32]:
surrogate_obj(log_batch, c, 0.2)

TypeError: min() received an invalid combination of arguments - got (numpy.float64, Tensor), but expected one of:
 * (Tensor input)
 * (Tensor input, Tensor other, *, Tensor out)
 * (Tensor input, int dim, bool keepdim, *, tuple of Tensors out)
 * (Tensor input, name dim, bool keepdim, *, tuple of Tensors out)
