In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
from collections import deque
import matplotlib.pyplot as plt
import glfw

observation = val_env.step(val_env.action_space.sample())[0]

reward = val_env.step(val_env.action_space.sample())[1]

terminated = val_env.step(val_env.action_space.sample())[2] # bool

truncated = val_env.step(val_env.action_space.sample())[3] # bool

info = val_env.step(val_env.action_space.sample())[4] # dict

dict_keys(['reward_linvel', 'reward_quadctrl', 'reward_alive', 'x_position', 'y_position', 'distance_from_origin', 'x_velocity', 'y_velocity', 'forward_reward'])

### Actor

In [2]:
class Actor(nn.Module) :
    def __init__(self, input_size, hidden_size, n_actions, init_weight, min_act_value, lr, device=torch.device('cpu')):
        super(Actor, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_actions = n_actions
        self.init_weight = init_weight
        self.min_act_value = min_act_value
        self.lr = lr
        self.device = device

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

        self.fc1 = nn.Linear(
            in_features=self.input_size,
            out_features=self.hidden_size
        )

        self.fc2 = nn.Linear(
            in_features=self.hidden_size,
            out_features=self.hidden_size
        )

        self.mean_fc3 = nn.Linear(
            in_features= self.hidden_size,
            out_features=self.n_actions
        )

        #self.mean_fc3.weight.data.uniform_(-self.init_weight, self.init_weight)
        #self.mean_fc3.bias.data.uniform_(-self.init_weight, self.init_weight)

        self.log_std_fc3 = nn.Linear(
            in_features= self.hidden_size,
            out_features=self.n_actions
        )

        #self.log_std_fc3.weight.data.uniform_(-self.init_weight, self.init_weight)
        #self.log_std_fc3.bias.data.uniform_(-self.init_weight, self.init_weight)

        self.optimizer = optim.Adam(params=self.parameters(), lr=self.lr)

    def forward(self, x):

        x = x.float() # from float64 to float32

        x = self.relu(self.fc1(x))

        x = self.relu(self.fc2(x))

        return self.mean_fc3(x), self.log_std_fc3(x)   # mean, log_std
    
    def act(self, state, greedy=False):

        mean, log_std = self.forward(state)

        action = self.tanh(mean)

        if not greedy: # explore

            gaussian = Normal(0, 1)
            z = gaussian.sample()
            action = self.tanh(mean + log_std.exp()*z)

        # clamp obtained values to [-0.4, 0.4]

        action_clamped = torch.clamp(action, 
                                     min=-self.min_act_value,
                                     max=self.min_act_value)
        
        return action_clamped


### Critic

In [3]:
class Critic(nn.Module):
    def __init__(self, input_size, hidden_size, init_weight, lr, device=torch.device('cpu')):
        super(Critic, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.init_weight = init_weight
        self.lr = lr
        self.device = device

        self.relu = nn.ReLU()

        self.fc1 = nn.Linear(
            in_features=self.input_size,
            out_features=self.hidden_size
        )

        self.fc2 = nn.Linear(
            in_features=self.hidden_size,
            out_features=self.hidden_size
        )

        self.fc3 = nn.Linear(
            in_features= self.hidden_size,
            out_features= 1
        )

        self.fc3.weight.data.uniform_(-self.init_weight, self.init_weight)
        self.fc3.bias.data.uniform_(-self.init_weight, self.init_weight)

        self.optimizer = optim.Adam(self.parameters(), lr=self.lr)

    def forward(self, x):

        x = x.float() # from float64 to float32

        x = self.relu(self.fc1(x))

        x = self.relu(self.fc2(x))

        return self.fc3(x)

### Algorithm

In [4]:
class Actor_Critic(nn.Module):
    def __init__(self, input_size, hidden_size, n_actions, init_weight, min_act_value, act_lr, crt_lr, num_episodes, device=torch.device('cpu')):
        super(Actor_Critic, self).__init__()
        self.device = device

        self.num_episodes = num_episodes

        self.Actor = Actor(
            input_size= input_size,
            hidden_size= hidden_size,
            n_actions= n_actions,
            init_weight= init_weight,
            min_act_value= min_act_value,
            lr= act_lr,
            device= self.device
        )

        self.Critic = Critic(
            input_size= input_size,
            hidden_size= hidden_size,
            init_weight= init_weight,
            lr= crt_lr,
            device= self.device
        )

    def save(self):
        torch.save(self.state_dict(), 'model.pt')

    def load(self):
        self.load_state_dict(torch.load('model.pt', map_location=self.device))

    def to(self, device):
        ret = super().to(device)
        ret.device = device
        return ret


    def train(self):

        E = gym.make(
            id='Humanoid-v4',
            healthy_reward=5,                       
            terminate_when_unhealthy=True,          
            healthy_z_range=(1.0, 2.0),             
            exclude_current_positions_from_observation=False,
        )

        scores = deque(maxlen=100)
        act_losses = deque(maxlen=10)
        crt_losses = deque(maxlen=10)

        for iter in range(self.num_episodes):

        # CONTAINERS

            Rewards = []
            Log_prob = []
            Values = []
            obs_next = E.reset()[0]
            done_next = False

        # ROLLOUT
            t = 0

            while not done_next:

            #obs = torch.from_numpy(obs_next)
            #done = done_next

                action = self.Actor.act(torch.from_numpy(obs_next))

                print(action)

                obs_next, reward, done, truncated, info = E.step(action.detach().numpy())

                print(f'''
                    \tNum_Rollout, Iteration: {t, iter}              
                    \tReward: {reward}  
                    \tReward_Alive: {info['reward_alive']}
                    \tX_position: {info['x_position']}
                    \tY_position: {info['y_position']}
                    \tDistance_from_Origin: {info['distance_from_origin']}
                    \tForward_Reward: {info['forward_reward']}
                    \n''')

                value = self.Critic.forward(torch.from_numpy(obs_next))

                Values.append(value)
                Rewards.append(reward)

                done_next = done | truncated

                    # check if z-coordinate of torso is inside the healthy_range

                    # 'reward_alive', 'x_position', 'y_position', 'distance_from_origin', 'x_velocity', 'y_velocity', 'forward_reward'

                t+=1 # update counter

            scores.append(sum(Rewards))

            print(np.mean(scores))

        # LEARNING

            # compute Advantage

            # compute Log_Prob

            actor_loss = torch.log(action).sum()
            critic_loss = value

            act_losses.append(actor_loss.item())
            crt_losses.append(critic_loss.item())

        # UPDATES

            # Actor
            self.Actor.optimizer.zero_grad()
            actor_loss.backward()
            self.Actor.optimizer.step()

            # Critic
            self.Critic.optimizer.zero_grad()
            critic_loss.backward()
            self.Critic.optimizer.step()

        # SAVE


            if iter % 10:

                print(f'''Episode {iter}
                    \tAverage Score: {np.mean(scores)}
                    \ttAverage Critic Loss: {np.mean(act_losses)}
                    \tAverage Actor Loss: {np.mean(crt_losses)}
                    \tLast Score: {sum(Rewards)}\n''')
                
                self.save()

        return

### Evaluation

In [5]:
def Evaluate_Humanoid(agent, n_eval_episodes):
    rewards = []

    agent.load()

    val_env = gym.make(
        id='Humanoid-v4',
        healthy_reward=5,                       # costant reward given after each timestep if the humanoid has an healthy posture
        terminate_when_unhealthy=True,          # if the humanoid has not a healthy posture, hence is out of range, then terminate
        healthy_z_range=(1.0, 2.0),              # z-coordinate of the torso the indicate if the humanoid has an healthy posture or not
        render_mode='human',
        exclude_current_positions_from_observation=False
    )

    for n_episode in range(n_eval_episodes):
        episode_rewards = 0
        obs = val_env.reset()[0]
        done = False

        while not done:
            
            action = agent.Actor.act(torch.from_numpy(obs))
            obs_next, rew, terminated, truncated, info = val_env.step(action.detach().numpy())
            done = terminated or truncated
            episode_rewards += rew

        print('Reward episode %d: %d' %(n_episode, episode_rewards))

        rewards.append(episode_rewards)

    print('Mean Reward: ', np.mean(rewards))
    glfw.terminate() # close Mujoco render

### Training and Testing

In [6]:
agent = Actor_Critic(
    input_size= 378,
    hidden_size= 32,
    n_actions= 17,
    init_weight= 3e-3,
    min_act_value= 4e-1,
    act_lr= 1e-3,
    crt_lr= 1e-3,
    num_episodes= 500
)

agent.train()
agent.save()

tensor([0.4000, 0.4000, 0.4000, 0.4000, 0.4000, 0.4000, 0.4000, 0.4000, 0.4000,
        0.4000, 0.4000, 0.4000, 0.4000, 0.4000, 0.4000, 0.4000, 0.4000],
       grad_fn=<ClampBackward1>)

                    	Num_Rollout, Iteration: (0, 0)              
                    	Reward: 4.729275738762621  
                    	Reward_Alive: 5.0
                    	X_position: 0.017738882143378584
                    	Y_position: 0.004374991263873602
                    	Distance_from_Origin: 0.01827042660300098
                    	Forward_Reward: 0.0012757468688522995
                    

tensor([-0.4000, -0.4000, -0.4000, -0.4000, -0.4000,  0.4000, -0.4000, -0.4000,
        -0.4000,  0.4000,  0.4000,  0.1426, -0.4000, -0.3490, -0.4000, -0.4000,
         0.1688], grad_fn=<ClampBackward1>)

                    	Num_Rollout, Iteration: (1, 0)              
                    	Reward: 4.7601741855892445  
                    	Reward_Alive: 5.0
                    	X_position: 0.017753714534

In [7]:
agent = Actor_Critic(
    input_size= 378,
    hidden_size= 32,
    n_actions= 17,
    init_weight= 3e-3,
    min_act_value= 4e-1,
    act_lr= 1e-3,
    crt_lr= 1e-3,
    num_episodes= 1000
)
Evaluate_Humanoid(agent, 10)

Reward episode 0: 81
Reward episode 1: 81
Reward episode 2: 76
Reward episode 3: 86
Reward episode 4: 87
Reward episode 5: 76
Reward episode 6: 113
Reward episode 7: 80
Reward episode 8: 76
Reward episode 9: 76
Mean Reward:  83.68179307400388


/home/brock/.local/lib/python3.10/site-packages/glfw/__init__.py:916: GLFWError: (65537) b'The GLFW library is not initialized'
