In [68]:
!pip install -r https://raw.githubusercontent.com/malkiAbdelhamid/Advanced-Deep-Learning-2023-2024-esisba/master/lab5_Proximal_Policy_Optimization/requirements_lab5.txt



In [69]:
import argparse
import pickle
from collections import namedtuple,deque
from itertools import count

import os, time
import numpy as np
import matplotlib.pyplot as plt

import gymnasium as gym3
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal, Categorical
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
from tensorboardX import SummaryWriter

In [70]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [71]:
env_id = "HalfCheetah-v4"
# Create the env
env = gym3.make(env_id)

# Create the evaluation env
eval_env = gym3.make(env_id)

# Get the state space and action space
s_size = env.observation_space.shape[0]
a_size = env.action_space.shape[0]

In [72]:
class Actor(nn.Module):
    def __init__(self, s_size, a_size, h_size):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.action_head = nn.Linear(h_size, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.action_head(x)
        return F.softmax(x, dim=1)

    def act(self, state,actions=None):

        logits = self.forward(state).cpu()
        m = torch.distributions.Normal(logits, 0.1)

        if actions is None:
           actions = m.sample()

        return actions.detach().numpy().squeeze(0), m.log_prob(actions)


In [73]:
#Using a neural network to learn state value
class Critic(nn.Module):

    #Takes in state
    def __init__(self, s_size, h_size):
        super(Critic, self).__init__()

        self.input_layer = nn.Linear(s_size, h_size)
        self.output_layer = nn.Linear(h_size, 1)

    def forward(self, x):
        #input layer
        x = self.input_layer(x)
        #activiation relu
        x = F.relu(x)
        #get state value
        state_value = self.output_layer(x)

        return state_value

In [74]:
def generate_trajectory(actor, critic, max_t):

        buffer = []
        Experience=namedtuple('experience', ['state', 'action',  'old_log_prob', 'reward',  'next_state','state_val'])

        state,_ = env.reset()
        for t in range(max_t):
            state_tensor=torch.from_numpy(state).float().unsqueeze(0).to(device)

            action, log_prob = actor.act(state_tensor)

            state_val = critic.forward(state_tensor)

            next_state, reward,done, _,_ = env.step(action)

            buffer.append( Experience(state, action, log_prob, reward, next_state,state_val))

            state=next_state

            if done:

                break

        return  buffer

In [75]:
def computer_cumulative_reward(rewards, max_t,gamma):

        returns =deque(maxlen=len(rewards))

        for t in range(len(rewards))[::-1]:
          disc_return_t = (returns[0] if len(returns)>0 else 0)
          returns.appendleft( rewards[t]+gamma*disc_return_t)
        return returns

In [76]:
def returns_standardization(returns):
        eps = np.finfo(np.float32).eps.item()
        ## eps is the smallest representable float, which is
        # added to the standard deviation of the returns to avoid numerical instabilities
        returns = torch.tensor(returns,dtype=torch.float)
        returns = (returns - returns.mean()) / (returns.std() + eps)

        return returns

In [77]:
#['state', 'action',  'old_log_prob', 'reward',  'next_state','state_val']
def train_actor_critic(actor, critic, actorOptimizer,criticOptimizer, buffer, returns, clip_param, writer,training_step):


        states = [t.state for t in buffer]
        actions = torch.tensor([t.action for t in buffer],).view(-1, 1) # !!!
        old_log_probs = torch.stack([t.old_log_prob for t in buffer]).squeeze(1).detach()
        state_values = torch.tensor([t.state_val for t in buffer], dtype=torch.float, requires_grad=True)

        #state_values= torch.stack([t.state_val for t in buffer]).squeeze()
        actor_loss=[]
        for  state, action,old_log_prob, Gt, state_value  in zip(states, actions,old_log_probs, returns, state_values):
            state=torch.from_numpy(state).float().unsqueeze(0).to(device)
            advantage = Gt - state_value.item()

            _,new_log_prob= actor.act(state, action)

# # Compute the ratio and surrogate loss
#         ratio = torch.exp(new_log_prob - old_log_prob)
#         surr1 = ratio * advantage
#         surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage


            ratio = new_log_prob / old_log_prob
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param)

            # update actor network
            actor_loss.append(-torch.min(surr1, surr2))



        actor_loss = torch.cat(actor_loss).mean()

        writer.add_scalar('loss/action_loss', actor_loss, global_step=training_step)
        actorOptimizer.zero_grad()
        actor_loss.backward()
        actorOptimizer.step()

        #update critic network

        value_loss = F.mse_loss(state_values, returns)
        writer.add_scalar('loss/value_loss', value_loss, global_step=training_step)
        criticOptimizer.zero_grad()
        value_loss.backward()
        criticOptimizer.step()
        training_step += 1

        return training_step



In [78]:
def PPO(actor,critic, actorOptimizer,criticOptimizer, n_training_episodes,  max_t, gamma,clip_param, writer, print_every):
    # Help us to calculate the score during the training
    scores_deque = deque(maxlen=100)
    scores = []
    training_step=0
    for i_episode in range(1, n_training_episodes+1):

        # Generate an episode
        buffer=generate_trajectory(actor,critic,max_t)
        rewards = [t.reward for t in buffer]

        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))

        # calculate the return
        returns= computer_cumulative_reward(rewards,max_t,gamma)

        ## standardization of the returns is employed to make training more stable
        returns=returns_standardization(returns)

        # Train Actor and Critic networks
        training_step=train_actor_critic(actor, critic, actorOptimizer,criticOptimizer, buffer, returns, clip_param, writer,training_step)





        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))


    return scores

In [79]:
HalfCheetah_hyperparameters = {
    "h_size": 64,
    "n_training_episodes": 300,
    "n_evaluation_episodes": 10,
    "max_t": 500,
    "gamma": 1.0,
    "lr": 1e-2,
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
    "clip_param":0.2
}

In [80]:
# Create actor and place it to the device
HalfCheetah_actor = Actor(HalfCheetah_hyperparameters["state_space"], HalfCheetah_hyperparameters["action_space"], HalfCheetah_hyperparameters["h_size"]).to(device)

HalfCheetah_actorOptimizer = optim.Adam(HalfCheetah_actor.parameters(), lr=HalfCheetah_hyperparameters["lr"])

In [81]:
# Create critic and place it to the device
HalfCheetah_critic = Critic(HalfCheetah_hyperparameters["state_space"], HalfCheetah_hyperparameters["h_size"]).to(device)
HalfCheetah_criticOptimizer = optim.Adam(HalfCheetah_critic.parameters(), lr=HalfCheetah_hyperparameters["lr"])

In [82]:
run_name = f"__{int(time.time())}"
writer = SummaryWriter(f"runs/{run_name}")

In [83]:
scores = PPO(HalfCheetah_actor, HalfCheetah_critic,
                   HalfCheetah_actorOptimizer,HalfCheetah_criticOptimizer,
                   HalfCheetah_hyperparameters["n_training_episodes"],
                   HalfCheetah_hyperparameters["max_t"],
                   HalfCheetah_hyperparameters["gamma"],
                   HalfCheetah_hyperparameters["clip_param"],
                   writer,
                   10)

Episode 10	Average Score: -16.28
Episode 20	Average Score: -17.16
Episode 30	Average Score: -17.75
Episode 40	Average Score: -18.03
Episode 50	Average Score: -17.65
Episode 60	Average Score: -16.94
Episode 70	Average Score: -16.77
Episode 80	Average Score: -16.87
Episode 90	Average Score: -16.98
Episode 100	Average Score: -17.03
Episode 110	Average Score: -17.15
Episode 120	Average Score: -17.08
Episode 130	Average Score: -18.03
Episode 140	Average Score: -18.68
Episode 150	Average Score: -19.68
Episode 160	Average Score: -20.90
Episode 170	Average Score: -22.69
Episode 180	Average Score: -25.31
Episode 190	Average Score: -25.55
Episode 200	Average Score: -25.61
Episode 210	Average Score: -25.50
Episode 220	Average Score: -25.38
Episode 230	Average Score: -24.10
Episode 240	Average Score: -23.01
Episode 250	Average Score: -21.65
Episode 260	Average Score: -20.47
Episode 270	Average Score: -18.61
Episode 280	Average Score: -15.63
Episode 290	Average Score: -15.11
Episode 300	Average Sco