In [2]:
!pip install torchviz

Collecting torchviz
  Downloading torchviz-0.0.2.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting graphviz (from torchviz)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading graphviz-0.20.3-py3-none-any.whl (47 kB)
Building wheels for collected packages: torchviz
  Building wheel for torchviz (setup.py) ... [?25ldone
[?25h  Created wheel for torchviz: filename=torchviz-0.0.2-py3-none-any.whl size=4132 sha256=2023c62b726b085b53fb4ec2aab99670d0d50029df573977665b944910265a41
  Stored in directory: /home/diego/.cache/pip/wheels/4c/97/88/a02973217949e0db0c9f4346d154085f4725f99c4f15a87094
Successfully built torchviz
Installing collected packages: graphviz, torchviz
Successfully installed graphviz-0.20.3 torchviz-0.0.2


In [18]:
import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding
import numpy as np
import os
from PIL import Image
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torchviz import make_dot
import shutil

import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage, VecEnvWrapper, VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

In [19]:
# Print the versions of gymnasium and stable_baselines3 for debugging purposes
print(f"{gym.__version__=}")
print(f"{stable_baselines3.__version__=}")
print(torch.__version__)

gym.__version__='0.29.1'
stable_baselines3.__version__='2.4.0a7'
2.4.1+cu121


In [20]:
# Change the paths based on your directory structure
log_dir = "tensorboard3/"
models_dir = "models3/"
project_dir = ""

if os.path.exists(models_dir):
    shutil.rmtree(models_dir)
    shutil.rmtree(log_dir)
    print(f"Folder '{models_dir}' and all its contents have been deleted.")
else:
    print(f"Folder '{project_dir}' does not exist.")

os.makedirs(log_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)


Folder 'models3/' and all its contents have been deleted.


In [21]:
class ImageClassificationEnv(gym.Env):
    metadata = {"render_modes": []}

    def __init__(self, image_folder, hand):
        super(ImageClassificationEnv, self).__init__()
        self.hand = hand
        self.image_folder = image_folder
        self.image_files = [f for f in os.listdir(image_folder) if f.endswith('.jpg')]

        # Observation space: A 3D array representing the image (64x64x3 for RGB images)
        self.observation_space = spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)

        # Action space: Discrete actions corresponding to the classes
        if self.hand == 'left':
            self.action_space = spaces.Discrete(3)
        elif self.hand == 'right':
            self.action_space = spaces.Discrete(9)

        self.current_step = 0
        self.max_steps = 30
        self.current_image = None
        self.image_label = None

        # Initialize the random number generator
        self.np_random = None

    def load_images(self):
        # Randomly select images for the episode using the environment's RNG
        self.selected_images = self.np_random.choice(
            self.image_files, size=self.max_steps + 1, replace=False
        ).tolist()

    def reset(self, seed=None, options=None, **kwargs):
        # Handle the seed
        super().reset(seed=seed)
        if seed is not None:
            self.np_random, _ = seeding.np_random(seed)
        else:
            if self.np_random is None:
                self.np_random, _ = seeding.np_random()
        self.current_step = 0
        self.load_images()
        obs = self._next_observation()
        info = {}
        return obs, info

    def _next_observation(self):
        # Load the next image
        image_file = self.selected_images[self.current_step]
        image_path = os.path.join(self.image_folder, image_file)
        image = Image.open(image_path).resize((64, 64))
        self.current_image = np.array(image).astype(np.float64) / 255.0  # Keep as uint8

        # Load the corresponding label from the txt file
        label_file = image_file.replace('.jpg', '.txt')
        label_path = os.path.join(self.image_folder, label_file)
        with open(label_path, 'r') as file:
            label = file.read().strip()

        if self.hand == 'left':
            label_mapping = {'swarm_1': 0, 'swarm_2': 1}
            self.image_label = label_mapping.get(label, 2)  # Default to 2 for 'no action'
        elif self.hand == 'right':
            label_mapping = {
                'up': 0, 'down': 1, 'left': 2, 'right': 3, 'backwards': 4,
                'forward': 5, 'take_off': 6, 'land': 7
            }
            self.image_label = label_mapping.get(label, 8)  # Default to 8 for 'no action'

        return self.current_image

    def step(self, action):
        # Reward logic
        if action == self.image_label:
            reward = 1
        else:
            reward = -1

        self.current_step += 1
        terminated = self.current_step >= self.max_steps
        truncated = False  # Use 'truncated' if you have a time limit or similar

        obs = self._next_observation()
        info = {}
        return obs, reward, terminated, truncated, info

    def render(self):
        pass  # Rendering is not required for this environment

In [22]:
# Choose the hand to train the agent for
train_agent_for_left_hand = False
env_name = ""
if train_agent_for_left_hand:
    base_env = ImageClassificationEnv(image_folder='rl_train_dataset_left_hand', hand='left')
    env_name = 'rl_train_dataset_left_hand'
else:
    base_env = ImageClassificationEnv(image_folder='rl_train_dataset_right_hand', hand='right')
    env_name = 'rl_train_dataset_right_hand'

base_env.reset()

(array([[[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        ...,
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]]),
 {})

In [23]:
class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 512):
        super(CustomCNN, self).__init__(observation_space, features_dim)
        
        # Define the convolutional layers followed by activation functions (ReLU)
        self.cnn = nn.Sequential(
            nn.Conv2d(observation_space.shape[0], 32, kernel_size=8, stride=4, padding=0),  # First Conv Layer
            nn.ReLU(),  # Activation
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0),  # Second Conv Layer
            nn.ReLU(),  # Activation
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),  # Third Conv Layer
            nn.ReLU(),  # Activation
            nn.Flatten()  # Flatten the output to feed into fully connected layer
        )
        
        # Automatically calculate the size after the convolutional layers
        with torch.no_grad():
            n_flatten = self.cnn(torch.as_tensor(observation_space.sample()[None]).float()).shape[1]
        
        # Define the fully connected layer with 512 output units
        self.linear = nn.Sequential(
            nn.Linear(n_flatten, features_dim),  # Fully connected layer
            nn.ReLU(),  # Activation
        )
    
    # Define the forward pass through the network
    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        # Pass through CNN layers and then through the fully connected layer
        return self.linear(self.cnn(observations))

In [24]:
# Define a callback class for saving models at regular intervals during training
class SaveOnIntervalCallback(BaseCallback):
    def __init__(self, save_interval: int, save_path: str, verbose=1):
        super().__init__(verbose)
        self.save_interval = save_interval
        self.save_path = save_path

    def _on_step(self) -> bool:
        # Save the model every 'save_interval' steps
        if self.num_timesteps % self.save_interval == 0:
            save_file = os.path.join(self.save_path, f'model_{self.num_timesteps}')
            self.model.save(save_file)
            if self.verbose > 0:
                print(f'Saving model to {save_file}.zip')
        return True

In [None]:
# Linear schedule from 2.5e-4 to 0
learning_rate = lambda f: f * 2.5e-4 

# Adjusted PPO hyperparameters
model = PPO(
    env=env,
    tensorboard_log=log_dir,
    policy='CnnPolicy',
    verbose=1,
    clip_range=0.1,             # Reduced clip range
    ent_coef=0.01,              # Reduced entropy coefficient
    learning_rate=learning_rate,
    n_epochs=4,                 # Reduced number of epochs
    n_steps=128,                # Reduced number of steps per update
    vf_coef=0.5,
    batch_size=256,             # Batch size divides n_steps * n_envs evenly
    device='cuda',
    policy_kwargs={'features_extractor_class': CustomCNN},
    seed=2
)

Using cuda device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=128 and n_envs=1)


In [None]:
# Define the interval at which models are saved during training
save_interval = 100000
save_callback = SaveOnIntervalCallback(save_interval, models_dir)

# Train the agent
total_timesteps = 10000000
model.learn(total_timesteps=total_timesteps, callback=save_callback, log_interval=20)

Logging to tensorboard3/PPO_1
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 30            |
|    ep_rew_mean          | -23.3         |
| time/                   |               |
|    fps                  | 262           |
|    iterations           | 20            |
|    time_elapsed         | 9             |
|    total_timesteps      | 2560          |
| train/                  |               |
|    approx_kl            | 4.9266033e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.1           |
|    entropy_loss         | -2.19         |
|    explained_variance   | 5.96e-08      |
|    learning_rate        | 0.00025       |
|    loss                 | 15.3          |
|    n_updates            | 76            |
|    policy_gradient_loss | -0.00101      |
|    value_loss           | 31            |
-------------------------------------------
-------------------------------------------
| 

In [None]:
# Evaluate the agent
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

# Save the model
model.save("final_"+env_name)