In [1]:
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import SubprocVecEnv
from gymnasium.envs.registration import register

from gymnasium import spaces
import torch as th
from torch import nn
import numpy as np
from sb3_contrib.common.wrappers import ActionMasker
from sb3_contrib.ppo_mask import MaskablePPO

import gymnasium as gym
import os

from gym_data.envs import CarcassoneEnv
# device = torch.device("cuda")
# Register the environment class with gymnasium under the id 'Carcassone-v0'.
# NOTE(review): the visible cells build environments directly from
# CarcassoneEnv and never call gym.make with this id — confirm it is still needed.
register(id='Carcassone-v0',entry_point='gym_data.envs:CarcassoneEnv',) 

In [2]:
# Names of the observation-dict entries that CustomCombinedExtractor builds a
# sub-network for.
# NOTE(review): no visible cell reads this list — confirm whether it is still used.
extractable = ["tile_planes", "chars_planes", "other_properties_plane"]

class CustomCombinedExtractor(BaseFeaturesExtractor):
    """Features extractor for the dict observation space of the Carcassone env.

    One sub-network is built per recognised observation key:

    * ``"tile_planes"`` and ``"chars_planes"`` each get the same small CNN
      (four 3x3 "same" convolutions, two 2x2 max-pools, 512 output channels,
      flattened).
    * ``"other_properties_plane"`` gets a two-layer MLP ending in 256 units.

    Keys that are not recognised are ignored. The outputs of all sub-networks
    are concatenated along the feature axis; the resulting width is measured
    with a dry run instead of being hard-coded, so the extractor keeps working
    when the observation shapes change.

    :param observation_space: dict space whose entries are inspected by key
    :raises ValueError: if none of the recognised keys is present
    """

    # Keys that are fed through the convolutional branch.
    _PLANE_KEYS = ("tile_planes", "chars_planes")
    # Key that is fed through the fully connected branch.
    _VECTOR_KEY = "other_properties_plane"

    def __init__(self, observation_space: spaces.Dict):
        # The real feature width is unknown until the sub-networks exist, and
        # nn.Module.__init__ must run before any module can be attached, so a
        # placeholder width is passed here and corrected below.
        super().__init__(observation_space, features_dim=1)

        extractors = {}
        for key, subspace in observation_space.spaces.items():
            if key in self._PLANE_KEYS:
                extractors[key] = self._build_plane_cnn(subspace.shape[0])
            elif key == self._VECTOR_KEY:
                # nn.Linear acts on the last axis, so that axis gives the
                # required input width (previously hard-coded as 618).
                extractors[key] = nn.Sequential(
                    nn.Linear(subspace.shape[-1], 512),
                    nn.LeakyReLU(),
                    nn.Linear(512, 256),
                )

        if not extractors:
            raise ValueError(
                "observation_space contains none of the supported keys: "
                f"{self._PLANE_KEYS + (self._VECTOR_KEY,)}"
            )

        self.extractors = nn.ModuleDict(extractors)

        total_concat_size = self._infer_features_dim(observation_space)
        print("Total concat size: ", total_concat_size)
        self._features_dim = total_concat_size

    @staticmethod
    def _build_plane_cnn(in_channels: int) -> nn.Sequential:
        """Build the CNN shared by both plane inputs.

        The layer order is kept exactly as before so that parameter names in
        previously saved checkpoints still match.
        """
        return nn.Sequential(
            nn.Conv2d(in_channels, 128, 3, stride=1, padding="same"),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(128),
            nn.Conv2d(128, 128, 3, stride=1, padding="same"),
            nn.BatchNorm2d(128),
            nn.Conv2d(128, 256, 3, stride=1, padding="same"),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(256),
            nn.Conv2d(256, 512, 3, stride=1, padding="same"),
            nn.Flatten(),
        )

    def _infer_features_dim(self, observation_space: spaces.Dict) -> int:
        """Measure the concatenated output width with a single zero-filled batch.

        The pass runs in eval mode and without gradients so the BatchNorm
        running statistics are not touched, and zeros are used rather than
        ``observation_space.sample()`` so no random state is consumed.
        """
        was_training = self.training
        self.eval()
        try:
            with th.no_grad():
                dummy = {
                    key: th.zeros((1, *observation_space.spaces[key].shape))
                    for key in self.extractors
                }
                return int(self.forward(dummy).shape[1])
        finally:
            # Restore whatever mode the module was in before the dry run.
            self.train(was_training)

    def forward(self, observations) -> th.Tensor:
        """Encode each recognised observation entry and concatenate the results.

        :param observations: mapping from observation key to a batched tensor
        :return: tensor of shape ``(batch, features_dim)``
        """
        encoded_tensor_list = [
            extractor(observations[key]) for key, extractor in self.extractors.items()
        ]
        return th.cat(encoded_tensor_list, dim=1)



In [3]:
# Policy construction options: plug in the custom dict-observation extractor
# and describe the MLP that follows it (one 1024-unit layer, then separate
# policy and value heads).
# NOTE(review): the visible cells never pass policy_kwargs to a model, and the
# "shared layer followed by a pi/vf dict" net_arch form appears to have been
# dropped in recent stable-baselines3 releases — confirm against the installed
# version before relying on it.
policy_kwargs = {
    "features_extractor_class": CustomCombinedExtractor,
    "net_arch": [1024, {"pi": [256, 128], "vf": [512, 64]}],
}

def mask_fn(env: gym.Env) -> np.ndarray:
    """Return the action mask for the environment's current state.

    Used as the mask callback handed to ``ActionMasker``; it simply delegates
    to the environment's own ``valid_action_mask`` method.

    :param env: environment exposing ``valid_action_mask()``
    :return: whatever ``env.valid_action_mask()`` returns
    """
    action_mask = env.valid_action_mask()
    return action_mask


def make_env():
    """Create one ``CarcassoneEnv`` wrapped in ``ActionMasker`` using ``mask_fn``.

    :return: the wrapped environment
    """
    return ActionMasker(CarcassoneEnv(), mask_fn)


# Vectorised environment made of 32 copies produced by make_env, i.e. each
# copy is already wrapped in ActionMasker.
# NOTE(review): the training cell further down rebinds `env` to environments
# built straight from CarcassoneEnv, so this masked version is not the one
# that ends up being trained on.
env = make_vec_env(make_env,seed=1, n_envs=32)


In [1]:
import tensorboard


In [2]:
# from tensorboard import notebook
# notebook.list()

In [3]:
%load_ext tensorboard

In [4]:
%tensorboard --logdir ./runs  --bind_all --port 8080 

In [8]:
def linear_schedule(initial_value):
    """
    Build a linearly decaying learning-rate schedule.

    :param initial_value: (float or str) rate at the start of training;
        a string is converted with ``float``
    :return: (function) maps the remaining progress to a learning rate
    """
    base_rate = float(initial_value) if isinstance(initial_value, str) else initial_value

    def schedule(progress):
        """
        Scale the initial rate by the remaining progress.

        :param progress: (float) decreases from 1 (beginning) to 0
        :return: (float)
        """
        return progress * base_rate

    return schedule

In [10]:
# Training cell: resume A2C from a saved checkpoint, train for one million
# timesteps, report the wall-clock duration and save the result.

# Run counter for this cell.
# NOTE(review): the name shadows the built-in `iter` for the rest of the
# kernel session, and it is reset to 0 on every execution of this cell.
iter = 0

# Rebinds `env`: 32 subprocess workers built directly from CarcassoneEnv,
# i.e. without the ActionMasker wrapper that make_env applies.
env = make_vec_env(CarcassoneEnv, seed=1,  n_envs=32, vec_env_cls=SubprocVecEnv)

# Alternative kept for reference: start from a freshly initialised model.
# model = A2C("MultiInputPolicy", env, verbose=1,tensorboard_log="runs/", device="auto", n_steps=5, learning_rate=linear_schedule(0.0001))
# Load the checkpoint "pretrained_carcassone3" and attach the new env; the
# learning rate decays linearly from 1e-4 via linear_schedule.
# NOTE(review): presumably the extra keyword arguments override the values
# stored in the checkpoint — confirm against the stable-baselines3 load() docs.
model = A2C.load("pretrained_carcassone3", env, verbose=1,tensorboard_log="runs/", device="auto", n_steps=5, learning_rate=linear_schedule(0.0001))
import time
# Alternative kept for reference: overwrite the weights from another checkpoint.
# model.set_parameters("model_carcassone")
start = time.time()
model.learn(total_timesteps=1000_000)

end = time.time()

elapsed_time = end - start

# Floor division of a float by 60: prints whole minutes as a float (e.g. 12.0).
print("Elapsed time: ", elapsed_time // 60, " minutes")

model.save("model_carcassone")
iter +=1

Total concat size:  4352
Logging to runs/A2C_5
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 101      |
|    ep_rew_mean        | -99.5    |
| time/                 |          |
|    fps                | 400      |
|    iterations         | 100      |
|    time_elapsed       | 39       |
|    total_timesteps    | 16000    |
| train/                |          |
|    entropy_loss       | -0.00124 |
|    explained_variance | 0        |
|    learning_rate      | 9.84e-05 |
|    n_updates          | 99       |
|    policy_loss        | 7.28e-13 |
|    value_loss         | 7.39     |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 101       |
|    ep_rew_mean        | -101      |
| time/                 |           |
|    fps                | 408       |
|    iterations         | 200       |
|    time_elapsed       | 78        |
|    total_timesteps

In [None]:
# Save `model` again to the same path the training cell already wrote to.
# Relies on `model` still being defined in the kernel.
model.save("model_carcassone")

In [None]:
# Evaluate the trained model for 10 episodes on the kernel's current `env`
# and print the mean episode reward. The second value returned by
# evaluate_policy is discarded.
from stable_baselines3.common.evaluation import evaluate_policy
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)

print(f"Mean Reward: {mean_reward}")