In [1]:
import gym
from env.custom_hopper import *
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [26]:
class Policy(nn.Module):
    """Actor-critic network with a shared two-layer trunk and two linear heads.

    The trunk maps a state to 128 features (state -> 64 -> 128, ReLU after each
    layer). The actor head turns those features into the mean of a Gaussian
    over actions; the critic head turns them into a single state value. The
    Gaussian's standard deviation is a state-independent learned parameter
    passed through softplus.

    Args:
        state_space: Size of the observation vector fed to the first layer.
        action_space: Number of action dimensions (size of the actor output).
    """

    def __init__(self, state_space, action_space):
        super().__init__()
        self.state_space = state_space
        self.action_space = action_space
        # `hidden` and `tanh` are not referenced by forward(); they are kept so
        # that the attribute set and module registration order stay the same.
        self.hidden = 64
        self.tanh = nn.Tanh()

        # Shared trunk: state_space -> 64 -> 128.
        self.embedding_ac = nn.Linear(state_space, 64)
        self.relu = nn.ReLU()
        self.fc1_ac = nn.Linear(64, 128)

        # Actor head: one mean per action dimension.
        self.fc2_actor = nn.Linear(128, action_space)

        # Critic head: scalar state value.
        self.fc2_critic = nn.Linear(128, 1)

        # Exploration noise: softplus(sigma) is used as the Gaussian std,
        # with every raw sigma entry starting at 0.5.
        self.sigma_activation = F.softplus
        self.sigma = nn.Parameter(torch.full((self.action_space,), 0.5))

        self.init_weights()

    def init_weights(self):
        """Re-initialise every Linear layer: Xavier-normal weights, zero biases."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_normal_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return ``(Normal(action_mean, action_std), state_value)`` for ``x``.

        NOTE: the trunk output goes through ``squeeze(0)`` before both heads,
        so an input whose leading dimension has size 1 (e.g. a batch holding a
        single state) comes back without that leading dimension.
        """
        features = self.relu(self.fc1_ac(self.relu(self.embedding_ac(x))))
        head_input = features.squeeze(0)

        # Critic
        value = self.fc2_critic(head_input)

        # Actor
        mean = self.fc2_actor(head_input)
        std = self.sigma_activation(self.sigma)

        return Normal(mean, std), value


In [27]:
def bootstrapped_discount_rewards(r, gamma, done, next_values):
    """Compute one-step bootstrapped targets ``r_t + gamma * V(s_{t+1})``.

    For transitions flagged in ``done`` the bootstrap term is dropped and the
    target is just ``r_t``.

    Args:
        r: Tensor of rewards. The result has this tensor's shape and dtype.
        gamma: Discount factor applied to ``next_values``.
        done: Per-transition termination flags (tensor or sequence); any
            non-zero / truthy entry marks a terminal transition. Must hold as
            many elements as ``r``.
        next_values: Value estimates for the successor states. Must hold as
            many elements as ``r``; a trailing singleton axis such as
            ``(T, 1)`` against rewards of shape ``(T,)`` is accepted.

    Returns:
        Tensor shaped like ``r`` holding the targets. Nothing is detached
        here: if ``next_values`` carries gradient, so does the result.
    """
    # Align everything to r's layout up front. The previous per-index loop took
    # its length from r.size(-1) while indexing dimension 0, so a (T, 1) reward
    # tensor only ever had its first entry filled in.
    aligned_values = next_values.reshape(r.shape).to(dtype=r.dtype)
    terminal = torch.as_tensor(done, device=r.device).reshape(r.shape).bool()
    return torch.where(terminal, r, r + gamma * aligned_values)

In [28]:
class Agent(object):
    """Actor-critic agent trained on one-step bootstrapped (TD) targets.

    Transitions are buffered with :meth:`store_outcome` and consumed in a
    single gradient step by :meth:`update_policy`.

    Args:
        policy: Module whose forward pass returns ``(action_distribution,
            state_value)``. Parameters whose name contains ``'critic'`` are
            treated as the value head.
        actor_lr: Adam learning rate for every non-critic parameter.
        critic_lr: Adam learning rate for the critic parameters.
        device: Device the policy is moved to and on which updates run.
    """

    def __init__(self, policy, actor_lr, critic_lr, device='cpu'):
        self.train_device = device
        self.policy = policy.to(self.train_device)

        # Every parameter must belong to exactly one optimizer. Selecting the
        # actor group with `'actor' in name` left any parameter whose name
        # contains neither keyword (shared layers, the exploration sigma)
        # outside both optimizers, so those parameters were never updated.
        # Anything that is not part of the critic now goes to the actor group.
        actor_params, critic_params = [], []
        for name, param in self.policy.named_parameters():
            if 'critic' in name:
                critic_params.append(param)
            else:
                actor_params.append(param)
        self.optimizer_actor = torch.optim.Adam(actor_params, lr=actor_lr)
        self.optimizer_critic = torch.optim.Adam(critic_params, lr=critic_lr)

        self.gamma = 0.99
        self.states = []
        self.next_states = []
        self.action_log_probs = []
        self.rewards = []
        self.done = []

    def update_policy(self):
        """Run one actor-critic update on the buffered transitions, then clear them.

        Does nothing when no transitions have been stored.
        """
        if not self.rewards:
            return

        # Flatten the per-step quantities to shape (T,) so that every later
        # operation is element-wise over the T stored transitions.
        action_log_probs = torch.stack(self.action_log_probs, dim=0).to(self.train_device).reshape(-1)
        states = torch.stack(self.states, dim=0).to(self.train_device)
        next_states = torch.stack(self.next_states, dim=0).to(self.train_device)
        rewards = torch.stack(self.rewards, dim=0).to(self.train_device).reshape(-1)
        done = torch.tensor([float(flag) for flag in self.done], device=self.train_device)

        self.states, self.next_states, self.action_log_probs, self.rewards, self.done = [], [], [], [], []

        # Value estimates, flattened to (T,). Left as (T, 1) they broadcast
        # against the (T,) targets into a (T, T) matrix, pairing every target
        # with every state's value in both losses.
        _, v_currentstate = self.policy(states)
        v_currentstate = v_currentstate.reshape(-1)
        # The bootstrap target is treated as a constant: no gradient may flow
        # into the critic through V(s_{t+1}).
        with torch.no_grad():
            _, v_nextstate = self.policy(next_states)
            v_nextstate = v_nextstate.reshape(-1)

        # r_t + gamma * V(s_{t+1}), with the bootstrap dropped on terminal steps.
        td_targets = bootstrapped_discount_rewards(rewards, self.gamma, done, v_nextstate)

        # Advantage = TD error. Detached so the actor loss does not push
        # gradient into the value estimate.
        advantages = (td_targets - v_currentstate).detach()

        actor_loss = -(action_log_probs * advantages).mean()
        critic_loss = torch.mean((td_targets - v_currentstate) ** 2)

        self.optimizer_actor.zero_grad()
        self.optimizer_critic.zero_grad()
        # One backward pass over the summed loss; parameters that feed both
        # losses receive the sum of both gradients.
        (actor_loss + critic_loss).backward()

        self.optimizer_actor.step()
        self.optimizer_critic.step()

    def get_action(self, state, evaluation=False):
        """Choose an action for a single NumPy ``state``.

        Returns:
            ``(action, log_prob)``. In evaluation mode the action is the
            distribution mean (detached from the graph) and ``log_prob`` is
            ``None``. Otherwise the action is sampled and ``log_prob`` is the
            log-probability summed over action dimensions, still attached to
            the graph so it can be used in :meth:`update_policy`.
        """
        x = torch.from_numpy(state).float().to(self.train_device)
        normal_dist, _ = self.policy(x)

        if evaluation:
            # Detached so the result can be converted to NumPy directly.
            return normal_dist.mean.detach(), None

        action = normal_dist.sample()
        action_log_prob = normal_dist.log_prob(action).sum()
        return action, action_log_prob

    def store_outcome(self, state, next_state, action_log_prob, reward, done):
        """Append one transition to the buffers consumed by :meth:`update_policy`."""
        self.states.append(torch.from_numpy(state).float())
        self.next_states.append(torch.from_numpy(next_state).float())
        self.action_log_probs.append(action_log_prob)
        self.rewards.append(torch.Tensor([reward]))
        self.done.append(done)

In [29]:
def train(env, agent, num_episodes, max_steps):
    """Run ``num_episodes`` training episodes and return each episode's total reward.

    Every episode steps the environment for at most ``max_steps`` steps (or
    until it reports ``done``), storing each transition on the agent, and then
    triggers a single ``agent.update_policy()`` call.

    Args:
        env: Environment whose ``reset()`` returns the initial state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        agent: Object providing ``get_action``, ``store_outcome`` and
            ``update_policy``.
        num_episodes: Number of episodes to run.
        max_steps: Upper bound on the number of steps per episode.

    Returns:
        List with the undiscounted reward sum of each episode, in order.
    """
    episode_rewards = []

    for episode in range(num_episodes):
        state = env.reset()
        episode_reward = 0

        for _ in range(max_steps):
            action, action_log_prob = agent.get_action(state)
            # detach() + cpu() make the conversion valid for tensors that live
            # on another device or are attached to the autograd graph; a bare
            # .numpy() raises in both cases.
            next_state, reward, done, _ = env.step(action.detach().cpu().numpy())

            agent.store_outcome(state, next_state, action_log_prob, reward, done)
            episode_reward += reward
            state = next_state

            if done:
                break

        agent.update_policy()
        episode_rewards.append(episode_reward)

        # Running mean over all episodes finished so far.
        avg_reward = np.mean(episode_rewards)
        print(f"Episode {episode + 1}, Average Reward: {avg_reward:.2f}")

    return episode_rewards

In [30]:
if __name__ == "__main__":
    # Train an actor-critic agent on CustomHopper-v0 and plot the reward curve.
    actor_lr = 5e-4
    critic_lr = 1e-4

    num_episodes = 1000
    max_steps = 1000

    env = gym.make('CustomHopper-v0')
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        policy = Policy(state_dim, action_dim)
        agent = Agent(policy, actor_lr=actor_lr, critic_lr=critic_lr)
        episode_rewards = train(env, agent, num_episodes, max_steps)
    finally:
        # Release the environment even when training raises; previously the
        # close() call sat after the plot and was skipped on any exception.
        env.close()

    plt.figure(figsize=(10, 6))
    plt.plot(episode_rewards, label=f"Actor LR: {actor_lr}, Critic LR: {critic_lr}")
    plt.title("Training Performance")
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.legend()
    plt.tight_layout()
    plt.show()

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x64 and 128x3)