In [1]:
!pip install setuptools wheel swig
!pip install gymnasium==1.1 gymnasium[box2d] torch numpy matplotlib moviepy -q


Collecting swig
  Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m965.5/965.5 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0

In [17]:
!mkdir /kaggle/working/saved_models

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import gymnasium as gym
from gymnasium.wrappers import GrayscaleObservation, ResizeObservation
import numpy as np
import os
import math
import copy
import shutil
from datetime import datetime

In [10]:
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, net_width):
        super(Actor, self).__init__()
        # L'input è in scala di grigi, quindi ha 1 canale
        in_channels = 1

        self.cnn_base = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        # Calcola la dimensione dell'output della CNN in modo dinamico
        with torch.no_grad():
            # state_dim è (H, W), es: (84, 84)
            dummy_input = torch.zeros(1, in_channels, *state_dim)
            cnn_out_dim = self.cnn_base(dummy_input).view(1, -1).size(1)

        self.fc1 = nn.Linear(cnn_out_dim, net_width)
        self.fc_pi = nn.Linear(net_width, action_dim)

    def forward(self, state):
        # Gestisce sia un singolo stato (H, W) che un batch (B, H, W)
        # Aggiunge la dimensione del canale per creare (B, C, H, W)
        if len(state.shape) == 3:  # Batch di stati (B, H, W)
            state = state.unsqueeze(1)
        elif len(state.shape) == 2:  # Singolo stato (H, W)
            state = state.unsqueeze(0).unsqueeze(0) # -> (1, 1, H, W)

        x = self.cnn_base(state)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc_pi(x)

    def pi(self, state, softmax_dim=1):
        logits = self.forward(state)
        probs = F.softmax(logits, dim=softmax_dim)
        return probs

class Critic(nn.Module):
    def __init__(self, state_dim, net_width):
        super(Critic, self).__init__()
        # L'input è in scala di grigi, quindi ha 1 canale
        in_channels = 1

        self.cnn_base = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        # Calcola la dimensione dell'output della CNN in modo dinamico
        with torch.no_grad():
            # state_dim è (H, W), es: (84, 84)
            dummy_input = torch.zeros(1, in_channels, *state_dim)
            cnn_out_dim = self.cnn_base(dummy_input).view(1, -1).size(1)

        self.fc1 = nn.Linear(cnn_out_dim, net_width)
        self.fc_v = nn.Linear(net_width, 1)

    def forward(self, state):
        # Gestisce sia un singolo stato (H, W) che un batch (B, H, W)
        # Aggiunge la dimensione del canale per creare (B, C, H, W)
        if len(state.shape) == 3:  # Batch di stati (B, H, W)
            state = state.unsqueeze(1)
        elif len(state.shape) == 2:  # Singolo stato (H, W)
            state = state.unsqueeze(0).unsqueeze(0) # -> (1, 1, H, W)

        x = self.cnn_base(state)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc_v(x)


def evaluate_policy(env, agent, turns=3):
    total_scores = 0
    for i in range(turns):
        s, info = env.reset()
        done = False
        episode_reward = 0
        while not done:
            a, logprob_a = agent.select_action(s, deterministic=True)
            s_next, r, dw, tr, info = env.step(a)
            done = dw or tr
            episode_reward += r
            s = s_next
        total_scores += episode_reward
    return total_scores / turns

def str2bool(v):
    if isinstance(v, bool): return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'): return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'): return False
    else: raise argparse.ArgumentTypeError('Boolean value expected.')

In [11]:
class PPO_discrete():
    def __init__(self, **kwargs):
        # Init hyperparameters for PPO agent, just like "self.gamma = opt.gamma, self.lambd = opt.lambd, ..."
        self.__dict__.update(kwargs)
        
        # Set exploration parameters
        self.initial_entropy_coef = self.initial_entropy_coef
        self.min_entropy_coef = self.min_entropy_coef if hasattr(self, 'min_entropy_coef') else 0.01
        self.initial_explore_steps = self.initial_explore_steps if hasattr(self, 'initial_explore_steps') else 10000
        self.total_steps_taken = 0
        self.entropy_coef = self.initial_entropy_coef

        '''Build Actor and Critic'''
        self.actor = Actor(self.state_dim, self.action_dim, self.net_width).to(self.dvc)
        self.actor_optimizer = torch.optim.AdamW(self.actor.parameters(), lr=self.lr)
        self.critic = Critic(self.state_dim, self.net_width).to(self.dvc)
        self.critic_optimizer = torch.optim.AdamW(self.critic.parameters(), lr=self.lr)

        '''Build Trajectory holder'''
        self.s_hoder = np.zeros((self.T_horizon, *self.state_dim), dtype=np.float32)
        self.a_hoder = np.zeros((self.T_horizon, 1), dtype=np.int64)
        self.r_hoder = np.zeros((self.T_horizon, 1), dtype=np.float32)
        self.s_next_hoder = np.zeros((self.T_horizon, *self.state_dim), dtype=np.float32)
        self.logprob_a_hoder = np.zeros((self.T_horizon, 1), dtype=np.float32)
        self.done_hoder = np.zeros((self.T_horizon, 1), dtype=np.bool_)
        self.dw_hoder = np.zeros((self.T_horizon, 1), dtype=np.bool_)

    def select_action(self, s, deterministic):
        s = torch.from_numpy(s).float().unsqueeze(0).to(self.dvc)
        with torch.no_grad():
            pi = self.actor.pi(s, softmax_dim=0)
            if deterministic:
                a = torch.argmax(pi).item()
                return a, None
            else:
                m = Categorical(pi)
                a = m.sample().item()
                pi_a = pi[0, a].item()
                return a, pi_a

    def train(self):
        # Update total steps and manage entropy coefficient decay
        self.total_steps_taken += self.T_horizon
                
        # Decay entropy coefficient but don't go below minimum
        # at 100000 steps, entropy_coed should be 0.2
        self.entropy_coef = self.min_entropy_coef + (self.initial_entropy_coef - self.min_entropy_coef) * \
                            math.exp(-1. * self.total_steps_taken / self.entropy_coef_decay)
                            
        # self.entropy_coef *= self.entropy_coef_decay INIZIALMENTE ERA COSI
        if self.entropy_coef < self.min_entropy_coef:
            self.entropy_coef = self.min_entropy_coef
         
        # DA RIMETTERE SE SI VUOLE MANTENERE ALTA ENTROPY COEF   
        # For very early exploration, maintain high entropy coefficient
        #if total_steps_taken < self.initial_explore_steps:
        #    entropy_coef = self.initial_entropy_coef
        
        '''Prepare PyTorch data from Numpy data'''
        s = torch.from_numpy(self.s_hoder).to(self.dvc)
        a = torch.from_numpy(self.a_hoder).to(self.dvc)
        r = torch.from_numpy(self.r_hoder).to(self.dvc)
        s_next = torch.from_numpy(self.s_next_hoder).to(self.dvc)
        old_prob_a = torch.from_numpy(self.logprob_a_hoder).to(self.dvc)
        done = torch.from_numpy(self.done_hoder).to(self.dvc)
        dw = torch.from_numpy(self.dw_hoder).to(self.dvc)

        ''' Use TD+GAE+LongTrajectory to compute Advantage and TD target'''
        with torch.no_grad():
            vs = self.critic(s)
            vs_ = self.critic(s_next)

            '''dw(dead and win) for TD_target and Adv'''
            deltas = r + self.gamma * vs_ * (~dw) - vs
            deltas = deltas.cpu().flatten().numpy()
            adv = [0]

            '''done for GAE'''
            for dlt, done in zip(deltas[::-1], done.cpu().flatten().numpy()[::-1]):
                advantage = dlt + self.gamma * self.lambd * adv[-1] * (~done)
                adv.append(advantage)
            adv.reverse()
            adv = copy.deepcopy(adv[0:-1])
            adv = torch.tensor(adv).unsqueeze(1).float().to(self.dvc)
            td_target = adv + vs
            if self.adv_normalization:
                adv = (adv - adv.mean()) / ((adv.std() + 1e-4))  #sometimes helps

        """PPO update"""
        #Slice long trajectopy into short trajectory and perform mini-batch PPO update
        optim_iter_num = int(math.ceil(s.shape[0] / self.batch_size))

        for _ in range(self.K_epochs):
            #Shuffle the trajectory, Good for training
            perm = np.arange(s.shape[0])
            np.random.shuffle(perm)
            perm = torch.LongTensor(perm).to(self.dvc)
            s, a, td_target, adv, old_prob_a = \
                s[perm].clone(), a[perm].clone(), td_target[perm].clone(), adv[perm].clone(), old_prob_a[perm].clone()

            '''mini-batch PPO update'''
            for i in range(optim_iter_num):
                index = slice(i * self.batch_size, min((i + 1) * self.batch_size, s.shape[0]))

                '''actor update'''
                prob = self.actor.pi(s[index], softmax_dim=1)
                entropy = Categorical(prob).entropy().sum(0, keepdim=True)
                prob_a = prob.gather(1, a[index])
                ratio = torch.exp(torch.log(prob_a) - torch.log(old_prob_a[index]))  # a/b == exp(log(a)-log(b))

                surr1 = ratio * adv[index]
                surr2 = torch.clamp(ratio, 1 - self.clip_rate, 1 + self.clip_rate) * adv[index]
                a_loss = -torch.min(surr1, surr2) - self.entropy_coef * entropy

                self.actor_optimizer.zero_grad()
                a_loss.mean().backward()
                torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 40)
                self.actor_optimizer.step()

                '''critic update'''
                c_loss = (self.critic(s[index]) - td_target[index]).pow(2).mean()
                for name, param in self.critic.named_parameters():
                    if 'weight' in name:
                        c_loss += param.pow(2).sum() * self.l2_reg

                self.critic_optimizer.zero_grad()
                c_loss.backward()
                self.critic_optimizer.step()

    def put_data(self, s, a, r, s_next, logprob_a, done, dw, idx):
        self.s_hoder[idx] = s
        self.a_hoder[idx] = a
        self.r_hoder[idx] = r
        self.s_next_hoder[idx] = s_next
        self.logprob_a_hoder[idx] = logprob_a
        self.done_hoder[idx] = done
        self.dw_hoder[idx] = dw

    def save(self, episode, train_rewards, eval_rewards):
        latest_path = "./saved_models/ppo_model.pth"
        torch.save({
            'episode': episode,
            'model_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'train_rewards': train_rewards,
            'eval_rewards': eval_rewards,
            'actor_optimizer_state_dict': self.actor_optimizer.state_dict(),
            'critic_optimizer_state_dict': self.critic_optimizer.state_dict(),
            'device': str(self.dvc),
            'total_steps_taken': self.total_steps_taken,
        }, latest_path)

    def load(self, latest_path):
        checkpoint = torch.load(latest_path, map_location=self.dvc, weights_only=False)
        self.critic.load_state_dict(checkpoint['critic_state_dict'])
        self.actor.load_state_dict(checkpoint['model_state_dict'])

        return checkpoint

In [19]:
# Sostituzione di argparse per l'uso in un notebook
class Args:
    def __init__(self):
        self.dvc = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.EnvIdex = 0
        self.write = True  # Abilita TensorBoard
        self.render = False # Disabilita il rendering a schermo
        self.Loadmodel = False
        self.ModelIdex = 0
        self.seed = 42
        self.T_horizon = 2048
        self.Max_train_steps = 5e5
        self.save_interval = 10000
        self.eval_interval = 2000
        self.gamma = 0.99
        self.lambd = 0.95
        self.clip_rate = 0.2
        self.K_epochs = 10
        self.net_width = 512
        self.lr = 2.5e-4
        self.l2_reg = 0.0
        self.batch_size = 256
        self.initial_entropy_coef = 0.001
        self.min_entropy_coef = 0.001
        self.entropy_coef_decay = 200000
        self.adv_normalization = True

opt = Args()
opt.dvc = torch.device(opt.dvc)
print(opt.__dict__)

def main():
    EnvName = ['CarRacing-v3']
    env = gym.make(EnvName[opt.EnvIdex], continuous=False, render_mode="rgb_array")
    opt.max_e_steps = env._max_episode_steps

    env = GrayscaleObservation(env)
    env = ResizeObservation(env, (84, 84))
    opt.state_dim = env.observation_space.shape
    opt.action_dim = env.action_space.n

    eval_env = gym.make(EnvName[opt.EnvIdex], continuous=False)
    eval_env = GrayscaleObservation(eval_env)
    eval_env = ResizeObservation(eval_env, (84, 84))

    torch.manual_seed(opt.seed)
    np.random.seed(opt.seed)
    print(f"Env: {EnvName[opt.EnvIdex]}, StateDim: {opt.state_dim}, ActionDim: {opt.action_dim}, Seed: {opt.seed}")

    agent = PPO_discrete(**vars(opt))
    train_rewards, eval_rewards = [], []

    if opt.Loadmodel:
        try:
            checkpoint = agent.load("/kaggle/working/saved_models/ppo_model.pth")
            train_rewards = checkpoint.get('train_rewards', [])
            eval_rewards = checkpoint.get('eval_rewards', [])
            agent.total_steps_taken = checkpoint.get('total_steps_taken', 0)
            print(f'Modello caricato dal passo {agent.total_steps_taken}...')
        except FileNotFoundError:
            print("Nessun modello salvato trovato. Inizio l'addestramento da zero.")
            opt.Loadmodel = False

    traj_lenth, total_steps = 0, 0 if not opt.Loadmodel else agent.total_steps_taken
    env_seed = opt.seed

    while total_steps < opt.Max_train_steps:
        s, info = env.reset(seed=env_seed)
        env_seed += 1
        done, episode_reward = False, 0

        while not done:
            a, logprob_a = agent.select_action(s, deterministic=False)
            s_next, r, dw, tr, info = env.step(a)
            done = dw or tr
            episode_reward += r

            if done: train_rewards.append((total_steps, episode_reward))

            agent.put_data(s, a, r, s_next, logprob_a, done, dw, idx=traj_lenth)
            s = s_next
            traj_lenth += 1
            total_steps += 1

            if traj_lenth % opt.T_horizon == 0:
                agent.train()
                traj_lenth = 0

            if total_steps % opt.eval_interval == 0:
                score = evaluate_policy(eval_env, agent, turns=3)
                eval_rewards.append(score)
                print(f'Ep: {total_steps // opt.T_horizon}, TrainR: {train_rewards[-1][1]:.2f}, EvalR: {score:.2f}, Steps: {total_steps}, Entr: {agent.entropy_coef:.3f}')

            if total_steps % opt.save_interval == 0:
                print(f'Salvataggio modello al passo {total_steps}...')
                agent.save(total_steps // opt.T_horizon, train_rewards, eval_rewards)

    env.close()
    eval_env.close()

if __name__ == '__main__':
    main()

{'dvc': device(type='cuda'), 'EnvIdex': 0, 'write': True, 'render': False, 'Loadmodel': False, 'ModelIdex': 0, 'seed': 42, 'T_horizon': 2048, 'Max_train_steps': 500000.0, 'save_interval': 10000, 'eval_interval': 2000, 'gamma': 0.99, 'lambd': 0.95, 'clip_rate': 0.2, 'K_epochs': 10, 'net_width': 512, 'lr': 0.00025, 'l2_reg': 0.0, 'batch_size': 256, 'initial_entropy_coef': 0.001, 'min_entropy_coef': 0.001, 'entropy_coef_decay': 200000, 'adv_normalization': True}
Env: CarRacing-v3, StateDim: (84, 84), ActionDim: 5, Seed: 42
Ep: 0, TrainR: -63.70, EvalR: -93.31, Steps: 2000, Entr: 0.001


ValueError: Expected parameter probs (Tensor of shape (256, 5)) of distribution Categorical(probs: torch.Size([256, 5])) to satisfy the constraint Simplex(), but found invalid values:
tensor([[nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        ...,
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan]], device='cuda:0', grad_fn=<DivBackward0>)