In [None]:
import math

import torch
import torch.nn as nn
import torch.optim as optim

# Policy Network (Actor)
class PolicyNetwork(nn.Module):
    """Actor: a three-layer MLP mapping a state to a tanh-squashed action.

    Args:
        state_dim: Size of the last dimension of the input state.
        action_dim: Size of the last dimension of the returned action.
        hidden_dim: Width of the two hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        # Kept under the attribute name `net` so state_dict keys stay `net.<i>.*`.
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Return the action for `state`; tanh keeps every component in [-1, 1]."""
        pre_activation = self.net(state)
        return pre_activation.tanh()  # Continuous actions

# Q-Network (Critic)
class QNetwork(nn.Module):
    """Critic: a three-layer MLP scoring a (state, action) pair with one value.

    Args:
        state_dim: Size of the last dimension of the state input.
        action_dim: Size of the last dimension of the action input.
        hidden_dim: Width of the two hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        input_dim = state_dim + action_dim
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, state, action):
        """Concatenate state and action on the last axis and return the Q-value.

        The result has a trailing dimension of size 1.
        """
        state_action = torch.cat((state, action), dim=-1)
        return self.net(state_action)

# Adaptive Entropy Coefficient (α)
class EntropyCoefficient(nn.Module):
    """Learnable entropy temperature, stored as log(alpha).

    Optimising log(alpha) instead of alpha keeps the coefficient returned by
    `forward` strictly positive.

    Args:
        init_alpha: Initial value of alpha; must be > 0.

    Raises:
        ValueError: If `init_alpha` is not strictly positive.
    """

    def __init__(self, init_alpha=0.2):
        super().__init__()
        if init_alpha <= 0:
            raise ValueError(f"init_alpha must be positive, got {init_alpha}")
        # math.log returns a Python float; the explicit dtype keeps the
        # parameter float32, the default dtype of the nn.Linear layers.
        self.log_alpha = nn.Parameter(
            torch.tensor(math.log(init_alpha), dtype=torch.float32)
        )

    def forward(self):
        """Return alpha = exp(log_alpha) as a 0-dim tensor attached to the graph."""
        return torch.exp(self.log_alpha)

In [None]:
class MetaPolicy(nn.Module):
    """High-level policy: a two-layer MLP that maps a state to a sub-goal vector.

    Args:
        state_dim: Size of the last dimension of the input state.
        goal_dim: Size of the last dimension of the produced sub-goal.
        hidden_dim: Width of the single hidden layer.
    """

    def __init__(self, state_dim, goal_dim, hidden_dim=256):
        super().__init__()
        encoder = nn.Linear(state_dim, hidden_dim)
        goal_head = nn.Linear(hidden_dim, goal_dim)
        self.net = nn.Sequential(encoder, nn.ReLU(), goal_head)

    def forward(self, state):
        """Return the unbounded (no output activation) sub-goal for `state`."""
        sub_goal = self.net(state)
        return sub_goal

# Hindsight Experience Replay (HER)
def relabel_trajectory(trajectory, achieved_goals):
    """Relabel a trajectory with its final achieved goal (hindsight relabeling).

    Every transition keeps its state, action, next_state and done flag; only
    the reward is replaced by `compute_reward(next_state, new_goal)`.

    Args:
        trajectory: Sequence of `(state, action, reward, next_state, done)`
            tuples.
        achieved_goals: Indexable collection of achieved goals; the last
            entry must support `.unsqueeze(0)` (e.g. a torch tensor).

    Returns:
        A new list of relabeled transition tuples. An empty trajectory yields
        an empty list without reading `achieved_goals`.
    """
    # Nothing to relabel; also avoids indexing [-1] into an empty goal buffer.
    if len(trajectory) == 0:
        return []

    new_goals = achieved_goals[-1].unsqueeze(0)  # Use final achieved goal as new target
    return [
        # compute_reward is the custom reward function supplied elsewhere.
        (state, action, compute_reward(next_state, new_goals), next_state, done)
        for state, action, _, next_state, done in trajectory
    ]

In [None]:
def _match_shape(value, reference):
    """Return `value` as a tensor with `reference`'s dtype and device.

    When both hold the same number of elements the result is reshaped to
    `reference.shape`, so a `(B,)` vector lines up with a `(B, 1)` Q-value
    instead of broadcasting to `(B, B)`. Other sizes are left to broadcast.
    """
    tensor = torch.as_tensor(value, dtype=reference.dtype, device=reference.device)
    if tensor.numel() == reference.numel():
        tensor = tensor.reshape(reference.shape)
    return tensor


def compute_loss(q_net, policy_net, alpha, states, actions, rewards, next_states, dones, gamma=0.99, target_entropy=None):
    """Compute the critic, actor and entropy-coefficient losses for one batch.

    Args:
        q_net: Critic called as `q_net(states, actions)`.
        policy_net: Actor called as `policy_net(states)`. It must also expose
            `log_prob(actions)`; this function does not provide a fallback.
        alpha: Entropy coefficient, either a tensor (e.g. the exponential of a
            learnable log-alpha) or a plain float.
        states, actions, next_states: Batched transition tensors.
        rewards, dones: Per-transition rewards and termination flags. Boolean
            flags and `(B,)`-shaped inputs are accepted; both are cast and
            reshaped to match the critic output.
        gamma: Discount factor.
        target_entropy: Entropy target for the alpha loss. Defaults to
            `-actions.shape[-1]` when omitted.

    Returns:
        Tuple `(q_loss, policy_loss, alpha_loss)` of scalar tensors.
    """
    # Bootstrapped TD target; no gradient flows through it.
    with torch.no_grad():
        next_actions = policy_net(next_states)
        next_q = q_net(next_states, next_actions)
        reward_term = _match_shape(rewards, next_q)
        # The float cast inside _match_shape also makes bool `dones` subtractable.
        not_done = 1.0 - _match_shape(dones, next_q)
        target_q = reward_term + gamma * not_done * next_q

    current_q = q_net(states, actions)
    q_loss = nn.MSELoss()(current_q, target_q)

    # Policy Loss with Entropy Regularization
    new_actions = policy_net(states)
    new_q = q_net(states, new_actions)
    log_prob = _match_shape(policy_net.log_prob(new_actions), new_q)
    # alpha is a constant for the actor update. Detaching it keeps alpha's graph
    # out of policy_loss, so the two losses can be back-propagated separately.
    alpha_const = alpha.detach() if torch.is_tensor(alpha) else alpha
    policy_loss = (alpha_const * log_prob - new_q).mean()

    # Entropy Coefficient Loss: gradient reaches alpha only.
    if target_entropy is None:
        target_entropy = -float(actions.shape[-1])
    alpha_loss = -alpha * (log_prob + target_entropy).detach().mean()

    return q_loss, policy_loss, alpha_loss

In [None]:
# Initialize networks
# state_dim, action_dim, goal_dim, num_episodes and env must already exist in
# the kernel when this cell runs; none of them is defined in this cell.
policy_net = PolicyNetwork(state_dim, action_dim)
q_net1 = QNetwork(state_dim, action_dim)
q_net2 = QNetwork(state_dim, action_dim)  # Second critic, trained with the same optimizer
alpha_net = EntropyCoefficient()
meta_policy = MetaPolicy(state_dim, goal_dim)
# NOTE(review): meta_policy is only passed to generate_trajectory below; no
# optimizer in this cell updates its parameters.

optimizer_policy = optim.Adam(policy_net.parameters(), lr=3e-4)
optimizer_q = optim.Adam(list(q_net1.parameters()) + list(q_net2.parameters()), lr=3e-4)
optimizer_alpha = optim.Adam([alpha_net.log_alpha], lr=3e-4)


def _stack_transitions(transitions):
    """Stack `(state, action, reward, next_state, done)` tuples into batch tensors.

    Every field is converted to a detached float32 tensor and stacked along a
    new leading batch axis. Rewards and done flags are returned as `(B, 1)`
    columns so they line up with the critic output.
    """
    columns = [
        torch.stack([torch.as_tensor(item, dtype=torch.float32).detach() for item in column])
        for column in zip(*transitions)
    ]
    states, actions, rewards, next_states, dones = columns
    return states, actions, rewards.reshape(-1, 1), next_states, dones.reshape(-1, 1)


for episode in range(num_episodes):
    # Collect trajectory using hierarchical policy
    trajectory, achieved_goals = generate_trajectory(policy_net, meta_policy, env)
    relabeled_trajectory = relabel_trajectory(trajectory, achieved_goals)
    if len(relabeled_trajectory) == 0:
        continue  # nothing collected this episode

    # Update Q-networks, one gradient step per transition
    for transition in relabeled_trajectory:
        batch = _stack_transitions([transition])
        # compute_loss returns (q_loss, policy_loss, alpha_loss); only the
        # critic loss is used for this update.
        q_loss1, _, _ = compute_loss(q_net1, policy_net, alpha_net(), *batch)
        q_loss2, _, _ = compute_loss(q_net2, policy_net, alpha_net(), *batch)
        optimizer_q.zero_grad()
        (q_loss1 + q_loss2).backward()
        optimizer_q.step()

    # Update policy and entropy coefficient on the whole relabeled trajectory
    episode_batch = _stack_transitions(relabeled_trajectory)
    _, policy_loss, alpha_loss = compute_loss(q_net1, policy_net, alpha_net(), *episode_batch)
    optimizer_policy.zero_grad()
    # retain_graph keeps any graph nodes shared with alpha_loss alive for the
    # second backward call below.
    policy_loss.backward(retain_graph=True)
    optimizer_policy.step()
    # Clears anything the policy backward may have accumulated on log_alpha.
    optimizer_alpha.zero_grad()
    alpha_loss.backward()
    optimizer_alpha.step()

In [None]:
# Environment Setup
# NOTE(review): env, state_dim and action_dim are assigned here, but the
# network-initialisation cell earlier in this notebook already reads them.
# Confirm the intended cell order for a clean Restart & Run All.
env = MultiValleyMountainCar()  # Custom environment
state_dim, action_dim = env.observation_space.shape[0], env.action_space.shape[0]

# Training
train(
    env,
    policy_net,
    q_net1,
    q_net2,
    alpha_net,
    meta_policy,
)

# Evaluation: report the mean reward over 100 episodes
mean_reward = evaluate(policy_net, env, num_episodes=100)
print(f"Mean Reward: {mean_reward}")