# Importing the necessary libraries

In [None]:
import gym
from env.custom_hopper import *
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.env_util import make_vec_env

# Defining the neural architecture for value function and agent function

In [None]:

def discount_rewards(r, gamma):
    """Compute the discounted return at every step of a reward sequence.

    The sequence is walked from the last step to the first, accumulating
    ``G[t] = r[t] + gamma * G[t + 1]`` with the return past the end taken
    as 0.

    Args:
        r: Reward tensor. The step count is read from its last dimension
            while entries are indexed along the first, so a 1-D tensor is
            the unambiguous input.
        gamma: Discount factor applied to the accumulated future return.

    Returns:
        A tensor shaped like ``r`` holding the discounted return per step.
    """
    returns = torch.zeros_like(r)
    future_return = 0
    for step in range(r.size(-1) - 1, -1, -1):
        future_return = r[step] + gamma * future_return
        returns[step] = future_return
    return returns

def bootstrapped_discount_rewards(r, gamma, done, next_values):
    """Compute one-step bootstrapped targets for each transition.

    For a non-terminal step the target is ``r[t] + gamma * next_values[t]``.
    For a terminal step there is no successor state to bootstrap from, so
    the target is the reward ``r[t]`` alone.

    Args:
        r: Reward tensor. The step count is read from its last dimension
            while entries are indexed along the first, so a 1-D tensor is
            the unambiguous input.
        gamma: Discount factor applied to the bootstrapped next-state value.
        done: Per-step sequence of truthy/falsy flags; a truthy entry marks
            a step with no bootstrap term.
        next_values: Per-step value estimates used for bootstrapping
            (presumably the critic's estimate for the successor state —
            confirm against the caller).

    Returns:
        A tensor shaped like ``r`` holding the target for every step.
    """
    targets = torch.zeros_like(r)
    for t in range(r.size(-1)):
        if done[t]:
            # Terminal step: drop only the bootstrap term, keep the reward.
            targets[t] = r[t]
        else:
            targets[t] = r[t] + gamma * next_values[t]
    return targets


In [None]:
class Policy(torch.nn.Module):
    """Shared-trunk actor-critic network: Gaussian policy plus state value.

    The trunk is ``Linear(state_space, 512) -> ReLU -> Linear(512, 2048) ->
    ReLU -> LSTM(2048, 1024)``. Two linear heads read the LSTM output: one
    produces the action mean, the other a scalar value. The action standard
    deviation is a learned, state-independent parameter passed through
    softplus.
    """

    def __init__(self, state_space, action_space):
        """Build the layers and initialise the linear weights.

        Args:
            state_space: Size of the last dimension of the observation input.
            action_space: Number of action dimensions.
        """
        super().__init__()
        self.state_space = state_space
        self.action_space = action_space
        # `hidden` and `tanh` are not used by forward(); kept so existing
        # attribute access keeps working.
        self.hidden = 64
        self.tanh = torch.nn.Tanh()

        # Trunk shared by the actor and the critic.
        self.embedding_ac = torch.nn.Linear(state_space, 512)
        self.relu = torch.nn.ReLU()
        self.fc1_ac = torch.nn.Linear(512, 2048)
        self.lstm_ac = torch.nn.LSTM(2048, 1024, batch_first=True)

        # Separate output heads on top of the shared trunk.
        self.fc2_actor = torch.nn.Linear(1024, action_space)
        self.fc2_critic = torch.nn.Linear(1024, 1)

        # Learned standard deviation; softplus keeps it strictly positive.
        self.sigma_activation = F.softplus
        init_sigma = 0.5
        self.sigma = torch.nn.Parameter(torch.zeros(self.action_space) + init_sigma)

        self.init_weights()

    def init_weights(self):
        """Draw every Linear weight from a standard normal and zero its bias.

        Only ``torch.nn.Linear`` modules are touched; the LSTM keeps
        PyTorch's default initialisation.
        """
        for m in self.modules():
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.normal_(m.weight)
                torch.nn.init.zeros_(m.bias)

    def forward(self, x):
        """Return the action distribution and value estimate for ``x``.

        Args:
            x: Observations shaped ``(batch, state_space)`` or
                ``(batch, n, state_space)``. For the 3-D form the embedded
                features are summed over dim 1 before the rest of the trunk.

        Returns:
            A tuple ``(normal_dist, value)``: a ``Normal`` whose mean has
            shape ``(batch, action_space)``, and a value tensor of shape
            ``(batch, 1)``.
        """
        x = self.embedding_ac(x)
        # Pool only when there is an extra axis in front of the features.
        # Summing dim 1 of a (batch, 512) tensor would collapse the feature
        # axis itself and break the following Linear(512, 2048).
        if x.dim() >= 3:
            x = torch.sum(x, dim=1)
        x = self.relu(x)
        x = self.fc1_ac(x)
        x = self.relu(x)
        # With batch_first=True and a leading axis of size 1, the rows of the
        # batch are fed to the LSTM as consecutive steps of one sequence. No
        # initial state is passed and the final state is discarded.
        x, _ = self.lstm_ac(x.unsqueeze(0))
        features = x.squeeze(0)

        action_mean = self.fc2_actor(features)
        action_sigma = self.sigma_activation(self.sigma)
        normal_dist = Normal(action_mean, action_sigma)

        value = self.fc2_critic(features)

        return normal_dist, value

To ensure that our AC is compatible with SB3's PPO, we create a wrapper for our agent by extending ActorCriticPolicy.

In [None]:
class CustomActorCriticPolicy(ActorCriticPolicy):
    """SB3 policy wrapper that delegates action/value prediction to ``Policy``.

    NOTE(review): the parent constructor presumably builds its optimizer
    before ``self.policy`` is assigned, so the custom network's weights may
    not be registered with it — confirm against the SB3 version in use.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Created after the parent constructor so that observation_space and
        # action_space are available on self.
        self.policy = Policy(self.observation_space.shape[0], self.action_space.shape[0])

    def _build_mlp_extractor(self):
        """Deliberately build no MLP extractor.

        SB3's ActorCriticPolicy expects an MLP (multi-layer perceptron)
        feature extractor to be defined. This override does nothing because
        the custom ``Policy`` network already performs feature extraction
        and action/value prediction.

        NOTE(review): the parent's build step presumably reads attributes of
        ``self.mlp_extractor`` right after calling this hook — confirm that
        construction does not fail when none is created.
        """

    def forward(self, obs):
        """Return ``(mean, stddev, value)`` from the custom network.

        NOTE(review): SB3 presumably expects ``forward`` to return
        ``(actions, values, log_prob)`` — confirm before training with PPO.
        """
        normal_dist, value = self.policy(obs)
        return normal_dist.mean, normal_dist.stddev, value

    def _predict(self, obs, deterministic=False):
        """Pick an action for ``obs`` and report its value and log-probability.

        Args:
            obs: Observation tensor passed straight to the custom network.
            deterministic: If true, use the distribution mean instead of
                sampling.

        Returns:
            A tuple ``(action, value, action_log_prob)``; the log-probability
            is summed over the action dimension only.

        NOTE(review): SB3 presumably expects ``_predict`` to return the
        action tensor alone — confirm against the SB3 version in use.
        """
        normal_dist, value = self.policy(obs)
        if deterministic:
            action = normal_dist.mean
        else:
            action = normal_dist.sample()
        # Sum over the action axis only; a bare .sum() would also collapse
        # the batch axis into a single scalar.
        action_log_prob = normal_dist.log_prob(action).sum(dim=-1)
        return action, value, action_log_prob

To include L2 regularization — also known as weight decay — in the PPO setup, you need to add weight decay to the optimizer. Stable Baselines3 doesn't provide a direct way to specify weight decay in the high-level PPO class, but you can customize the policy_kwargs to include it in the optimizer setup.



In [None]:
class CustomAdam(torch.optim.Adam):
    """Adam optimizer that requires an explicit learning rate and weight decay."""

    def __init__(self, params, lr, weight_decay):
        """Forward ``params``, ``lr`` and ``weight_decay`` to ``torch.optim.Adam``."""
        adam_options = {"lr": lr, "weight_decay": weight_decay}
        super().__init__(params, **adam_options)


In [None]:
def make_optimizer(policy, learning_rate, weight_decay=1e-6):
    """Create separate weight-decayed Adam optimizers for actor and critic.

    Parameters are assigned by name: those whose name contains ``'actor'``
    go to the actor optimizer and those whose name contains ``'critic'`` go
    to the critic optimizer. Parameters whose names contain neither
    substring are not included in either optimizer.

    Args:
        policy: Module exposing ``named_parameters()``.
        learning_rate: Learning rate used by both optimizers.
        weight_decay: L2 regularization coefficient used by both optimizers.

    Returns:
        A tuple ``(optimizer_actor, optimizer_critic)``.

    Raises:
        ValueError: Raised by ``torch.optim`` when no parameter name matches
            one of the two substrings (empty parameter list).
    """
    named_params = list(policy.named_parameters())
    actor_params = [param for name, param in named_params if 'actor' in name]
    critic_params = [param for name, param in named_params if 'critic' in name]

    # Weight decay on each optimizer provides the L2 regularization.
    optimizer_actor = CustomAdam(actor_params, lr=learning_rate, weight_decay=weight_decay)
    optimizer_critic = CustomAdam(critic_params, lr=learning_rate, weight_decay=weight_decay)

    return optimizer_actor, optimizer_critic

In [None]:
# Extra keyword arguments that PPO forwards to the policy constructor.
# Weight decay (L2 regularization) is configured on the optimizer here.
# A policy class does not belong in `features_extractor_class`: that slot
# expects a BaseFeaturesExtractor subclass, whereas a custom policy class is
# passed as PPO's first argument.
policy_kwargs = dict(
    optimizer_class=torch.optim.Adam,
    optimizer_kwargs=dict(weight_decay=1e-6),
)

In [None]:
# Construct the PPO agent; TensorBoard logs go to ./ppo_hopper_tensorboard/.
# NOTE(review): the environment argument below is commented out. PPO
# presumably requires it as its second argument, so restore it (with an `env`
# in scope) before running this cell.
# NOTE(review): "MlpPolicy" selects SB3's built-in policy; the custom policy
# class defined in this notebook is not passed here — confirm this is intended.
model = PPO(
        "MlpPolicy",
        #env,
        policy_kwargs=policy_kwargs,
        gamma=0.998,
        gae_lambda=0.95,
        ent_coef=0.01,  # To vary this, you'd need to use a schedule
        clip_range=0.2,
        # 51,200 rollout steps per update, i.e. ten times batch_size.
        n_steps=5120 * 10,
        batch_size=5120,
        n_epochs=3,
        learning_rate=3e-4,  # To vary this, you'd need to use a schedule
        vf_coef=1,
        max_grad_norm=0.5,
        tensorboard_log="./ppo_hopper_tensorboard/",
        verbose=1,
    )