In [None]:
import gym
import torch
from a3_gym_env.envs.pendulum import CustomPendulumEnv
import numpy as np

from torch import optim

from torch.distributions import Normal

from torch.optim import Adam
from torch.distributions.multivariate_normal import MultivariateNormal
from Modules import PolicyNetwork, ExperienceReplayBuffer, CriticNetwork

# [DONE] Task 1: Start by implementing an environment interaction loop. You may refer to homework 1 for inspiration.
# [ ] Task 2: Create and test an experience replay buffer with a random policy, which is the Gaussian distribution with arbitrary (randomly initialized) weights of the policy feed-forward network, receiving state, s, and returning the mean, mu(s) and the log_std, log_std(s) (natural logarithm of the standard deviation) of actions.  As mentioned above, you can use a state-independent standard variance.
# [ ] Task 3: Make an episode reward processing function to turn one-step rewards into discounted rewards-to-go: R(s_1) = sum_{t=1} gamma^{t-1} r_t, which is the discounted reward, starting from the state, s_1.
# [ ] Task 4: Start the model by implementing a vanilla policy gradient agent, where the gradient ascent steps are done with the average of the gradient of log-likelihood over a trajectory weighted by rewards-to-go from each state. Try different step sizes in the gradient ascent.
# [ ] Task 5: Pendulum is a continuous action space environment. Check out the example in `Modules.py` for torch implementation of the Gaussian module.  (if you work in Julia, speak with me regarding the pendulum dynamics in Julia, and Flux for DNNs.)
# [ ] Task 6: Add a feed-forward network for the critic, accepting the state, s=[sin(angle), cos(angle), angular velocity], and returning a scalar for the value of the state, s.
# [ ] Task 7: Implement the generalized advantage, see Eq11-12 in the PPO paper, to be used instead of rewards-to-go.
# [ ] Task 8: Implement the surrogate objective for the policy gradient, see Eq7, with and without clipping.
# [ ] Task 9: Implement the total loss, see Eq9 in the PPO.
# [ ] Task 10: Combine all together to Algorithm 1 in the PPO paper. (In your basic implementation, you can collect data with a single actor, N=1)
# [ ] Task 11: You should see progress with default hyperparameters, but you can try tuning those to see how it will improve your results.

# Pick the best backend that is actually available on this machine.
# Hard-coding "mps" only works on Apple-silicon builds of PyTorch; anywhere
# else the first `.to(device)` call fails.
if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


def interaction_loop(max_step=1000):
    """Drive the custom pendulum environment with uniformly random actions.

    Task 1 smoke test: steps the environment, renders each frame and resets
    whenever an episode finishes.

    Args:
        max_step: Total number of environment steps to take across episodes.
    """
    env = gym.make("Pendulum-v1-custom")
    try:
        env.reset()
        for _ in range(max_step):
            # Sample a random action from the environment's action space.
            action = env.action_space.sample()
            _, _, done, _ = env.step(action)
            # render() draws the frame itself, so its return value is not needed.
            env.render()
            if done:
                env.reset()
    finally:
        # Release the render window even if a step raises.
        env.close()

    
# task 2 * Create and test an experience replay buffer with a random policy, which is the 
#Gaussian distribution with arbitrary (randomly initialized) weights of the policy feed-forward network,
#receiving state, s, and returning the mean, mu(s) and the log_std, log_std(s) 
#(natural logarithm of the standard deviation) of actions.  As mentioned above, you can use 
#a state-independent standard variance.
def test_experience_relay_buffer():
    """Fill an experience replay buffer while acting randomly (Task 2).

    Random actions drive the environment; every transition is pushed into an
    ``ExperienceReplayBuffer`` and the randomly initialised policy network is
    evaluated on each state as a smoke test of its forward pass.
    """
    batch_size = 1000
    max_step = 1000

    def _as_state(observation):
        # Observations become (1, obs_dim) float tensors on the module device.
        return torch.tensor(
            observation, dtype=torch.float32, device=device
        ).unsqueeze(0)

    env = gym.make("Pendulum-v1-custom")
    try:
        state = _as_state(env.reset())
        policy = PolicyNetwork(
            env.observation_space.shape[0], env.action_space.shape[0]
        ).to(device)
        memory = ExperienceReplayBuffer(batch_size)

        for _ in range(max_step):
            # Act randomly; the policy output is not used to pick actions here.
            action = env.action_space.sample()
            obs, reward, done, _ = env.step(action)
            next_state = _as_state(obs)
            memory.push(state, action, next_state, reward)

            # Forward pass only, to check the policy accepts the state tensor.
            policy(state)

            # render() draws the frame itself, no plotting needed.
            env.render()

            # After a terminal step the next episode must start from the
            # freshly reset observation, not from the terminal state.
            state = _as_state(env.reset()) if done else next_state
    finally:
        env.close()


# Sample an action from Normal distribution
def choose_action(policy, state):
    """Sample an action for one state from the policy's Normal distribution.

    Args:
        policy: Callable that maps a ``(1, obs_dim)`` float tensor to a pair
            of tensors: the mean and a second output that is exponentiated
            and square-rooted to obtain the scale.
        state: NumPy array holding a single observation.

    Returns:
        Tuple ``(action, log_prob)`` of tensors: the sampled action and its
        log-probability under the distribution.
    """
    batched_state = torch.from_numpy(state).float().unsqueeze(0)
    mean, raw_spread = policy(batched_state)
    # NOTE(review): exp followed by sqrt treats the second output as a
    # log-variance; the task text describes it as log_std -- confirm against
    # the PolicyNetwork definition in Modules.py.
    scale = torch.sqrt(torch.exp(raw_spread))
    gaussian = Normal(mean, scale)
    sampled = gaussian.sample()
    return sampled, gaussian.log_prob(sampled)

# task 3 * Make an episode reward processing function to turn one-step rewards into discounted rewards-to-go:
#R(s_1) = sum_{t=1} gamma^{t-1} r_t, which is the discounted reward, starting from the state, s_1.   
def dis_r_to_go(r_batch, gamma):
    """Turn per-step rewards into discounted rewards-to-go.

    Entry ``t`` of the result is ``sum_k gamma**k * r_batch[t + k]``.

    Args:
        r_batch: Indexable sequence of one-step rewards for one trajectory.
        gamma: Discount factor.

    Returns:
        NumPy float array with the same length as ``r_batch``.
    """
    n_steps = len(r_batch)
    discounted = np.zeros(n_steps)
    running = 0
    # Walk backwards so each entry reuses the return of the step after it.
    for step in range(n_steps - 1, -1, -1):
        running = r_batch[step] + gamma * running
        discounted[step] = running

    return discounted
 
    
# task 4 Start the model by implementing a vanilla policy gradient agent, where the gradient ascent steps
#are done with the average of the gradient of log-likelihood over a trajectory weight by rewards-to-go   
#from each state. Try different step sizes in the gradient ascent.
def run_trajectory(env_name, num_trajectory, gamma, learning_rate):
    """Collect trajectories and take one vanilla policy-gradient step (Task 4).

    A freshly initialised ``PolicyNetwork`` is rolled out for
    ``num_trajectory`` episodes. The loss is the negative mean over all
    collected steps of ``log pi(a_t | s_t) * R_t``, where ``R_t`` is the
    discounted reward-to-go, so one Adam step on it is a gradient-ascent step
    on the policy-gradient objective.

    Args:
        env_name: Gym environment id passed to ``gym.make``.
        num_trajectory: Number of episodes to roll out (must be >= 1).
        gamma: Discount factor for the rewards-to-go.
        learning_rate: Step size for the Adam optimizer.

    Returns:
        Tuple ``(obs_batch, act_batch, r_tg_batch, loss)``:
        ``obs_batch`` is a float tensor with one row per collected step,
        ``act_batch`` is the list of sampled action tensors,
        ``r_tg_batch`` is a 1-D float tensor of rewards-to-go aligned with
        ``obs_batch`` (trajectories concatenated in order), and ``loss`` is
        the scalar loss value as a Python float.

    Raises:
        ValueError: If ``num_trajectory`` is smaller than 1.
    """
    if num_trajectory < 1:
        raise ValueError("num_trajectory must be at least 1")

    env = gym.make(env_name)
    input_size = env.observation_space.shape[0]
    output_size = env.action_space.shape[0]
    hidden = 32
    policy = PolicyNetwork(input_size, output_size, hidden)  # init policy function
    optimizer = Adam(policy.parameters(), lr=learning_rate)

    obs_batch = []   # visited states
    act_batch = []   # sampled actions (tensors)
    r_tg_batch = []  # one rewards-to-go array per trajectory
    log_batch = []   # log-probabilities of the sampled actions

    try:
        for _ in range(num_trajectory):
            obs = env.reset()
            done = False
            traj_r = []

            while not done:
                action, log_prob = choose_action(policy, obs)
                act_batch.append(action)
                log_batch.append(log_prob)
                obs_batch.append(obs)

                # The environment works on NumPy arrays shaped like its action
                # space, not on (1, act_dim) torch tensors.
                env_action = action.detach().cpu().numpy().reshape(-1)
                obs, reward, done, _ = env.step(env_action)
                traj_r.append(reward)

            # Honour the caller's discount factor instead of a fixed constant.
            r_tg_batch.append(dis_r_to_go(np.array(traj_r), gamma))
    finally:
        env.close()

    obs_batch = torch.as_tensor(np.array(obs_batch), dtype=torch.float)
    # Concatenate rather than stack so episodes of different lengths work.
    r_tg_batch = torch.as_tensor(np.concatenate(r_tg_batch), dtype=torch.float)
    # One scalar log-probability per step (summed over action dimensions);
    # these keep their autograd graph so the loss can be back-propagated.
    log_probs = torch.stack([step_log_prob.sum() for step_log_prob in log_batch])

    optimizer.zero_grad()
    loss = -(log_probs * r_tg_batch).mean()  # minimising this is gradient ascent
    loss.backward()
    optimizer.step()

    return obs_batch, act_batch, r_tg_batch, loss.item()
     

'''Task 5: * Pendulum is a continuous action space environment. 
Check out the example in `Modules.py` for torch implementation of the Gaussian module.  
(if you work in Julia, speak with me regarding the pendulum dynamics in Julia, and Flux for DNNs.)'''
    

'''Task 6: Add a feed-forward network for the critic, accepting the state, s=[sin(angle), cos(angle), angular velocity], 
and returning a scalar for the value of the state, s.'''
def CriticNet(state, critic=None):
    """Return the critic's scalar value estimate for a single state (Task 6).

    Args:
        state: Array-like or tensor of shape ``(1, obs_dim)``.
        critic: Optional callable (e.g. a trained ``CriticNetwork``) mapping
            the state tensor to a single-element tensor. When omitted, a new
            ``CriticNetwork(obs_dim, 1)`` is constructed for this call, so
            the result comes from untrained weights.

    Returns:
        The value estimate as a Python float.
    """
    # as_tensor accepts NumPy arrays and tensors alike, unlike the legacy
    # torch.FloatTensor constructor.
    state_tensor = torch.as_tensor(state, dtype=torch.float32)
    if critic is None:
        critic = CriticNetwork(state_tensor.shape[1], 1)
    # Only the number is returned, so no autograd graph is needed.
    with torch.no_grad():
        value = critic(state_tensor)
    return value.item()
    
    
'''Task 7 Implement the generalized advantage, see Eq11-12 in the PPO paper, to be used instead of rewards-to-go.'''

def generalized_advantage(r_batch, value, mask, gamma=0.99, lamd=0.95):
    """Compute generalized advantage estimates (PPO paper, Eq. 11-12).

    For each step ``t`` (processed backwards)::

        delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        A_t     = delta_t + gamma * lamd * mask_t * A_{t+1}

    The value after the final step is taken to be 0.

    Args:
        r_batch: Sequence of one-step rewards.
        value: Sequence of critic values ``V(s_t)``, same length as
            ``r_batch``.
        mask: Sequence with 1 where the episode continues after step ``t``
            and 0 where step ``t`` is terminal, so nothing leaks across
            episode boundaries (assumed convention -- confirm with callers).
            ``None`` means no boundaries inside the batch.
        gamma: Discount factor.
        lamd: GAE lambda (bias/variance trade-off).

    Returns:
        1-D float tensor of advantages, one per step.

    Raises:
        ValueError: If the inputs do not all have the same length.
    """
    def _as_vector(seq):
        # Accept lists, NumPy arrays and (possibly grad-tracking) tensors.
        if torch.is_tensor(seq):
            seq = seq.detach().cpu().numpy()
        return np.asarray(seq, dtype=np.float64).reshape(-1)

    rewards = _as_vector(r_batch)
    values = _as_vector(value)
    not_done = np.ones_like(rewards) if mask is None else _as_vector(mask)
    if not (len(rewards) == len(values) == len(not_done)):
        raise ValueError("r_batch, value and mask must have the same length")

    adv = np.zeros(len(rewards))
    gae = 0.0
    next_value = 0.0

    for t in reversed(range(len(rewards))):
        # TD residual; the bootstrap term vanishes at episode ends.
        delta = rewards[t] + gamma * next_value * not_done[t] - values[t]
        gae = delta + gamma * lamd * not_done[t] * gae
        adv[t] = gae
        next_value = values[t]

    return torch.as_tensor(adv, dtype=torch.float)


     

'''Task 8  Implement the surrogate objective for the policy gradient, see Eq7, with and without clipping. '''
def surrogate_obj():
    """Placeholder for the PPO surrogate objective (Task 8).

    Not implemented yet: takes no arguments and returns ``None``.
    """
    # TODO: implement the clipped and unclipped surrogate objective.
    return None


'''Task 9 Implement the total loss, see Eq9 in the PPO. '''
def PPO_eq9():
    """Placeholder for the PPO total loss (Task 9).

    Not implemented yet: takes no arguments and returns ``None``.
    """
    # TODO: implement the combined loss.
    return None


'''Task 10 Combine all together to Algorithm 1 in the PPO paper. 
(In your basic implementation, you can collect data with a single actor, N=1)'''
def algorithm_1():
    """Placeholder for the full PPO training loop (Task 10).

    Not implemented yet: takes no arguments and returns ``None``.
    """
    # TODO: combine data collection, advantage estimation and the PPO update.
    return None


if __name__ == "__main__":
    # Sample rewards kept for the manual dis_r_to_go check further down.
    mybatch=[1,3,5,7,9]
    #interaction_loop()
    #test_experience_relay_buffer()
    
    
    name = 'Pendulum-v1-custom'
    # run_trajectory returns four values (states, actions, rewards-to-go,
    # loss); unpacking only three raises ValueError.
    s_batch, a_batch, togo_batch, loss = run_trajectory(name, 1, 0.95, 0.01)
    
    #criticNet(s_batch)
    
    
    
    #res=dis_r_to_go(mybatch, 0.9)
    
    

In [2]:
# Scratch cell: create the custom pendulum environment and turn its first
# observation into a (1, obs_dim) float tensor for the cells below.
env = gym.make('Pendulum-v1-custom')
#policy = PolicyNetwork(env.observation_space.shape[0], env.action_space.shape[0])

initial_observation = env.reset()
obs = torch.from_numpy(initial_observation).float().unsqueeze(0)
#x1,x2 = policy(obs)
#x1

  logger.warn(


In [3]:
# Scratch cell: evaluate the critic on the observation tensor built above.
# CriticNet constructs a new CriticNetwork inside the call, so this value
# presumably reflects freshly initialised weights -- confirm in Modules.py.
s=CriticNet(obs)

In [4]:
s

0.17903950810432434

In [2]:
import torch
# Scratch check: build a 1x1 tensor of zeros.
torch.zeros(1, 1)


tensor([[0.]])

In [5]:
import numpy as np
# Scratch check: the same zero-initialised array that dis_r_to_go starts from.
r_togo = np.zeros(3)