In [None]:
from pettingzoo.classic import connect_four_v3
import ray.rllib.algorithms.ppo as ppo
from ray.rllib.algorithms.pg import (
    PG,
    PGConfig,
)
import ray
from ray import tune, air
from ray.tune import CLIReporter, register_env
from ray.rllib.policy.policy import Policy
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.utils.framework import try_import_torch
from bnbot.wrappers.connect4wrapper import Connect4Env
from bnbot.models.connect4model import Connect4MaskModel
from bnbot.models.connect4model import SacConnect4MaskModel
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib.policy.policy import PolicySpec
import imageio
import numpy as np
import sys
from ray.rllib.algorithms import sac
import random
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from bnbot.policies.dummy_policy import AlwaysSameHeuristic, BeatLastHeuristic, RandomHeuristic

# Obtain torch through RLlib's import helper; `torch` is used by the SAC
# action-mask model defined further down in this notebook.
torch, nn = try_import_torch()

In [None]:
# Shut down any Ray runtime that is still attached to this kernel so the
# experiments below do not reuse a stale one.
ray.shutdown()
# ray.init()  # left disabled — presumably Tune starts Ray on demand; TODO confirm

In [None]:
def env_creator(config):
    """Create the raw PettingZoo Connect Four environment.

    ``config`` is accepted so the function has the shape of an environment
    creator, but it is not read: the environment is always built with
    ``render_mode="rgb_array"``.
    """
    return connect_four_v3.env(render_mode="rgb_array")


# Expose the wrapped environment to RLlib under the name "connect4".
register_env("connect4", lambda config: Connect4Env(env_creator(config)))

In [None]:
def create_add_policy_cb(policy_id, checkpoint):
    """Build a callbacks class that adds a checkpointed policy as "opponent".

    Args:
        policy_id: Key used to select one policy from the object returned by
            ``Policy.from_checkpoint(checkpoint)``.
        checkpoint: Checkpoint handed to ``Policy.from_checkpoint``.

    Returns:
        The ``AddPolicyCallback`` class itself (not an instance), suitable for
        passing to ``.callbacks(...)`` on an algorithm config.
    """

    class AddPolicyCallback(DefaultCallbacks):
        """Registers the restored policy when the algorithm is initialised."""

        def on_algorithm_init(self, *, algorithm, **kwargs):
            restored = Policy.from_checkpoint(checkpoint)
            opponent_policy = restored[policy_id]

            # The restored policy need not come from the same algorithm as the
            # training stack; it is always registered under the id "opponent"
            # and is also added to the evaluation workers.
            algorithm.add_policy(
                policy_id="opponent",
                policy=opponent_policy,
                evaluation_workers=True,
            )

    return AddPolicyCallback


def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    """Assign "main" or "opponent" to an agent, alternating by episode parity.

    The last character of ``agent_id`` is read as the seat number; the agent
    whose seat equals ``episode.episode_id % 2`` is played by "main", the
    other one by "opponent".
    """
    parity = episode.episode_id % 2
    seat = int(agent_id[-1:])
    if parity == seat:
        return "main"
    return "opponent"
 

In [None]:
def ask_user_for_action(obs, player_id):
    """
    Asks the user for a valid action on the command line and returns it.

    The user is re-prompted until the input parses as an integer that is one
    of the legal actions.

    Args:
        obs: Mapping from player id to that player's observation. The
            observation must contain an "action_mask" entry in which legal
            actions are marked with 1.
        player_id: Key in ``obs`` of the player who is about to move.

    Returns:
        The chosen action as a Python int.

    Raises:
        ValueError: If the action mask marks no action as legal (otherwise the
            prompt could never be satisfied).
    """
    mask = np.asarray(obs[player_id]["action_mask"])
    # Derive the legal action indices from the mask itself instead of assuming
    # a fixed number of actions.
    legal_moves = [int(move) for move in np.flatnonzero(mask == 1)]
    if not legal_moves:
        raise ValueError("No legal moves available for {}".format(player_id))

    prompt = "Choose an action from {}: ".format(legal_moves)
    while True:
        try:
            choice = int(input(prompt))
        except ValueError:
            # Not an integer: ask again.
            continue
        if choice in legal_moves:
            return choice

# PPO Algorithm

In [None]:
# Train two independent PPO policies, one per seat: the agent "player_0" is
# always played by "policy_0" and every other agent by "policy_1".
config = (
    ppo.PPOConfig()
    .environment("connect4")
    .framework("torch")
    .training(model={"custom_model": Connect4MaskModel})
    .multi_agent(
        policies={"policy_0", "policy_1"},
        policy_mapping_fn=(
            lambda agent_id, episode, worker, **kw: (
                "policy_0" if agent_id == "player_0" else "policy_1"
            )
        ),
        policies_to_train=[
            "policy_0",
            "policy_1",
        ],
    )
)

# Stopping criteria handed to Tune.
stop = {
    "timesteps_total": 100000,
    "training_iteration": 10,
}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(
        stop=stop,
        # Without an explicit checkpoint config Tune saves nothing for an
        # RLlib trainable, and `results.get_best_result().checkpoint` would be
        # None when the trained algorithm is restored afterwards.
        checkpoint_config=air.CheckpointConfig(checkpoint_at_end=True),
    ),
)

results = tuner.fit()

In [None]:
best_result = results.get_best_result()
# Fail with a clear message instead of an obscure error inside RLlib when the
# training run did not save any checkpoint.
if best_result.checkpoint is None:
    raise RuntimeError(
        "The training run produced no checkpoint; enable checkpointing via "
        "air.CheckpointConfig in the Tuner's RunConfig before restoring."
    )
algo = Algorithm.from_checkpoint(best_result.checkpoint)
# NOTE(review): from_checkpoint presumably already loads the saved state, which
# would make this restore redundant — kept as-is, confirm before removing.
algo.restore(best_result.checkpoint)
# `algo` can now compute actions from environment observations.

# PG Algorithm

In [None]:
# Vanilla policy gradient with one policy per seat: "player_0" is mapped to
# "policy_0", any other agent to "policy_1"; both policies are trained.
config = (
    PGConfig()
    .environment("connect4")
    .framework("torch")
    .training(model={"custom_model": Connect4MaskModel})
    .multi_agent(
        policies={"policy_0", "policy_1"},
        policy_mapping_fn=lambda agent_id, episode, worker, **kw: (
            "policy_1" if agent_id != "player_0" else "policy_0"
        ),
        policies_to_train=["policy_0", "policy_1"],
    )
)

# Stopping criteria handed to Tune.
stop = {"timesteps_total": 100000, "training_iteration": 20}

results = tune.Tuner(
    "PG",
    run_config=air.RunConfig(stop=stop),
    param_space=config,
).fit()

# Heuristic and Random policies

In [None]:
class SelfPlayCallback(DefaultCallbacks):
    """League-style self-play callback for the "learned" policy.

    After every training result it computes a win rate for "learned" from the
    result's ``hist_stats``. When the win rate exceeds ``win_rate_threshold``
    a copy of the current "learned" policy is added under the id
    ``learned_v<k>`` and the policy mapping function is replaced so that the
    copy becomes one of the possible opponents.
    """

    # Win rate that must be exceeded before a new snapshot is added.
    win_rate_threshold = 0.95

    def __init__(self):
        """Start with no learned snapshot in the league."""
        super().__init__()
        # Number of learned snapshots added so far; the k-th snapshot gets the
        # policy id f"learned_v{k}" (k starts at 1). The scripted opponents
        # "random", "always_same" and "beat_last" are not counted here.
        self.current_opponent = 0

    def on_train_result(self, *, algorithm, result, **kwargs):
        """Record win rate / league size in ``result`` and maybe add a snapshot.

        Writes ``result["win_rate"]`` and ``result["league_size"]`` and removes
        the "policy_learned_reward" and "episode_reward" entries from
        ``result["hist_stats"]``.

        Raises:
            Exception: If the two reward lists differ in length.
        """
        # Get the win rate for the train batch.
        # Note that normally, one should set up a proper evaluation config,
        # such that evaluation always happens on the already updated policy,
        # instead of on the already used train_batch.
        main_rew = result["hist_stats"].pop("policy_learned_reward")
        # NOTE(review): despite the name this is the per-episode "episode_reward"
        # list, not an opponent-only reward. Presumably it is the sum over both
        # agents, so `r_main > r_opponent` below amounts to "the opponent's
        # reward was negative" — TODO confirm.
        opponent_rew = result["hist_stats"].pop("episode_reward")

        if len(main_rew) != len(opponent_rew):
            raise Exception("len(main_rew) != len(opponent_rew)", len(main_rew), len(opponent_rew), result["hist_stats"].keys(), "episode len", len(opponent_rew))

        # Count the episodes in which the learned policy's reward is strictly
        # greater than the value it is compared against.
        won = 0
        for r_main, r_opponent in zip(main_rew, opponent_rew):
            if r_main > r_opponent:
                won += 1
        # NOTE(review): raises ZeroDivisionError if `main_rew` is empty.
        win_rate = won / len(main_rew)

        result["win_rate"] = win_rate
        # No newline here: one of the two branches below completes the line.
        print(f"Iter={algorithm.iteration} win-rate={win_rate} -> ", end="")

        # If win rate is good -> Snapshot current policy and play against
        # it next, keeping the snapshot fixed and only improving the "learned"
        # policy.
        if win_rate > self.win_rate_threshold:
            self.current_opponent += 1
            new_pol_id = f"learned_v{self.current_opponent}"
            print(f"Iter={algorithm.iteration} ### Adding new opponent to the mix ({new_pol_id}).")

            # Re-define the mapping function, such that "learned" is forced
            # to play against one of the scripted policies ("always_same",
            # "beat_last", "random") or any of the snapshots added so far.
            def policy_mapping_fn(agent_id, episode, worker, **kwargs):
                # The last character of agent_id is the seat (0 or 1); the
                # seat matching the episode id's parity is played by "learned".
                # This way "learned" sometimes moves first and sometimes second.
                # `self.current_opponent` is read at call time, so the list of
                # snapshot ids reflects the counter's value when the function
                # is called, not when it was defined.
                return (
                    "learned"
                    if episode.episode_id % 2 == int(agent_id[-1:])
                    else random.choice(["always_same", "beat_last", "random"] + [f"learned_v{i}" for i in range(1, self.current_opponent + 1)])
                )

            # Add the new policy (same class as "learned") and install the new
            # mapping function in the same call.
            new_policy = algorithm.add_policy(
                policy_id=new_pol_id,
                policy_cls=type(algorithm.get_policy("learned")),
                policy_mapping_fn=policy_mapping_fn,
            )

            # Set the weights of the new policy to the learned policy.
            # We'll keep training the learned policy, whereas `new_pol_id` will
            # remain fixed.
            learned_state = algorithm.get_policy("learned").get_state()
            new_policy.set_state(learned_state)
            # We need to sync the just copied local weights (from learned policy)
            # to all the remote workers as well.
            algorithm.workers.sync_weights()
        else:
            print("not good enough; will keep learning ...")

        # +4 = "learned" + the three scripted policies
        # ("random", "always_same", "beat_last").
        result["league_size"] = self.current_opponent + 4


In [None]:
def select_policy(agent_id, episode, **kwargs):
    """Initial policy mapping for league training.

    The agent whose seat number (last character of ``agent_id``) equals
    ``episode.episode_id % 2`` is played by "learned"; the other agent gets a
    randomly chosen scripted policy.
    """
    parity = episode.episode_id % 2
    seat = int(agent_id[-1:])
    if parity != seat:
        return random.choice(["always_same", "beat_last", "random"])
    return "learned"

# League training: PPO optimises only "learned". Its opponents start out as
# the three scripted heuristics; SelfPlayCallback adds snapshots of "learned"
# to the league while training runs.
# NOTE(review): `num_gpus=1` presumably requires a GPU on the machine running
# this notebook — confirm before running elsewhere.
config = (
    ppo.PPOConfig()
    .environment("connect4")
    .framework("torch")
    .callbacks(SelfPlayCallback)
    .training(model={"custom_model": Connect4MaskModel})
    .rollouts(num_envs_per_worker=4, num_rollout_workers=0)
    .resources(num_cpus_per_worker=6, num_gpus=1)
    .multi_agent(
        policies={
            "learned": PolicySpec(),
            "always_same": PolicySpec(policy_class=AlwaysSameHeuristic),
            "beat_last": PolicySpec(policy_class=BeatLastHeuristic),
            "random": PolicySpec(policy_class=RandomHeuristic),
        },
        policies_to_train=["learned"],
        policy_mapping_fn=select_policy,
    )
)

# Stopping criteria handed to Tune.
stop = {"timesteps_total": 10_000_000, "training_iteration": 200}

results = tune.Tuner(
    "PPO",
    param_space=config,
    run_config=air.RunConfig(
        stop=stop,
        verbose=2,
        # Save a checkpoint every 10 iterations and once more at the end.
        checkpoint_config=air.CheckpointConfig(
            checkpoint_frequency=10,
            checkpoint_at_end=True,
        ),
        # Progress table columns: result key -> column header. "win_rate" and
        # "league_size" are the keys written by SelfPlayCallback.
        progress_reporter=CLIReporter(
            sort_by_metric=True,
            metric_columns={
                "training_iteration": "iter",
                "time_total_s": "time_total_s",
                "timesteps_total": "ts",
                "episodes_this_iter": "train_episodes",
                "policy_reward_mean/learned": "reward",
                "win_rate": "win_rate",
                "league_size": "league_size",
            },
        ),
    ),
).fit()

In [None]:
# Best trial of the most recent `results`.
# NOTE(review): this rebinds `best_result`; the SAC cell later reads
# `best_result.checkpoint` and asks for the policy "policy_0", which looks like
# it expects the earlier two-policy PPO run — confirm the intended cell order.
best_result = results.get_best_result()
# Shown as the cell output.
best_result.checkpoint

# Record a video of a game

In [None]:
# Fresh wrapped environment that renders to RGB arrays, so every frame can be
# appended to a video file.
env = Connect4Env(connect_four_v3.env(render_mode="rgb_array"))

obs, info = env.reset()
done = False

In [None]:
# Let the two trained policies play one game against each other and write one
# rendered frame per move to `filename`.
filename = "testtesttest.mp4"
with imageio.get_writer(filename, fps=30) as video:
    while not done:
        # The observation dict's first key is taken as the player to move.
        player_id = list(obs.keys())[0]
        action = algo.compute_single_action(
            obs[player_id],
            policy_id="policy_0" if player_id == "player_0" else "policy_1",
        )

        # If the policy proposes an illegal column, fall back to a random
        # *legal* one; sampling from all columns could pick another illegal move.
        legal_moves = np.flatnonzero(obs[player_id]["action_mask"] == 1)
        if legal_moves.size and action not in legal_moves:
            action = np.random.choice(legal_moves)

        player_actions = {player_id: action}

        obs, rew, terminated, truncated, info = env.step(player_actions)
        done = terminated["__all__"] or truncated["__all__"]
        video.append_data(env.render())

In [None]:
# Train a new "main" PPO policy against a fixed opponent: the policy
# "learned_v5" taken from the best checkpoint of `results`.
# NOTE(review): "learned_v5" only exists if that run added at least five
# snapshots — confirm before running.
AddPolicyCallback = create_add_policy_cb(
    "learned_v5",
    results.get_best_result().checkpoint,
)

config = (
    ppo.PPOConfig()
    .environment("connect4")
    .framework("torch")
    .training(model={"custom_model": Connect4MaskModel})
    .callbacks(AddPolicyCallback)
    .multi_agent(
        # Only "main" is declared here; the "opponent" policy that
        # `policy_mapping_fn` also refers to is added by AddPolicyCallback
        # when the algorithm is initialised.
        policies={"main"},
        # Only "main" is optimised.
        policies_to_train=["main"],
        # Seats alternate between "main" and "opponent" by episode-id parity,
        # so "main" never plays against itself.
        policy_mapping_fn=policy_mapping_fn,
    )
)
stop = {"timesteps_total": 10_000_000, "training_iteration": 50}
self_results = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(
        stop=stop,
        verbose=2,
        # Save a checkpoint every 10 iterations and once more at the end.
        checkpoint_config=air.CheckpointConfig(
            checkpoint_frequency=10,
            checkpoint_at_end=True,
        ),
        # Progress table columns: result key -> column header.
        # NOTE(review): "win_rate" and "league_size" are written by
        # SelfPlayCallback, which is not attached to this run.
        progress_reporter=CLIReporter(
            sort_by_metric=True,
            metric_columns={
                "training_iteration": "iter",
                "time_total_s": "time_total_s",
                "timesteps_total": "ts",
                "episodes_this_iter": "train_episodes",
                "policy_reward_mean/main": "reward",
                "win_rate": "win_rate",
                "league_size": "league_size",
            },
        ),
    ),
).fit()

In [None]:
# Load the best trial of the self-play run. This must read `self_results`
# (the run that trained the "main" policy), not `results` from the earlier
# league training, whose checkpoints contain no "main" policy.
best_self_result = self_results.get_best_result()
algo = Algorithm.from_checkpoint(best_self_result.checkpoint)
algo.restore(best_self_result.checkpoint)
# `algo` can now compute actions from environment observations.

# Human vs Trained Agent

In [None]:
# Restore the trained algorithm (with exploration switched off) to play
# against a human on the command line.
# The checkpoint must come from `self_results`: `config` describes the
# self-play setup whose trained policy is "main", which is the policy the game
# loop below queries. `results` belongs to a different run.
best_result_checkpt = self_results.get_best_result().checkpoint
config.explore = False
algo = config.build()
algo.restore(best_result_checkpt)

In [None]:
# Environment rendered for a human player.
env = connect_four_v3.env(render_mode="human")
env = Connect4Env(env)

done = False
obs, info = env.reset()

human_player = "player_1"

# Compare against the player id explicitly: a non-empty string is always
# truthy, so `"o" if human_player else "x"` could never print "x".
print("You play as {}".format("o" if human_player == "player_1" else "x"))

env.render()

In [None]:
# Play one game: the human enters moves for `human_player`, the trained
# "main" policy moves for the other player.
while not done:
    # The observation dict's first key is taken as the player to move.
    player_id = list(obs.keys())[0]
    if player_id == human_player:
        action = ask_user_for_action(obs, player_id)
    else:
        action = algo.compute_single_action(obs[player_id], policy_id="main")

        # If the policy proposes an illegal column, fall back to a random
        # *legal* one; sampling from all columns could pick another illegal move.
        legal_moves = np.flatnonzero(obs[player_id]["action_mask"] == 1)
        if legal_moves.size and action not in legal_moves:
            action = np.random.choice(legal_moves)

    player_actions = {player_id: action}

    obs, rew, terminated, truncated, info = env.step(player_actions)
    done = terminated["__all__"] or truncated["__all__"]

# The sign of the human's reward from the final step decides the message.
print("End of game!")
if rew[human_player] > 0:
    print("You win")
elif rew[human_player] < 0:
    print("You lose")
else:
    print("Draw")

# SAC Algorithm

In [None]:

from ray.rllib.algorithms.sac.sac_torch_model import SACTorchModel
from gymnasium.spaces import Dict
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils import override
from ray.rllib.utils.torch_utils import FLOAT_MIN

class SacConnect4MaskModel1(SACTorchModel):
    """SAC torch model that masks out illegal actions.

    The observation space must be (or wrap, via ``original_space``) a ``Dict``
    space with the keys "action_mask" and "observation". Logits are computed by
    an internal fully connected network from "observation" only; entries whose
    mask value is 0 are then pushed down to a very large negative value.

    Relies on the notebook-level ``torch`` obtained from ``try_import_torch()``.
    """

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name: str,
        policy_model_config=None,
        q_model_config=None,
        twin_q=False,
        initial_alpha=1.0,
        target_entropy=None,
        **kwargs,
    ):
        """Validate the observation space and build the internal network.

        All arguments are forwarded to ``SACTorchModel.__init__``.
        """
        orig_space = getattr(obs_space, "original_space", obs_space)

        assert isinstance(orig_space, Dict)
        assert "action_mask" in orig_space.spaces
        assert "observation" in orig_space.spaces

        # `name` has to be forwarded as well: leaving it out shifts every
        # following positional argument by one slot (policy_model_config would
        # be received as the name, q_model_config as the policy config, ...).
        # The remaining arguments are passed by keyword so they cannot shift.
        super().__init__(
            obs_space,
            action_space,
            num_outputs,
            model_config,
            name,
            policy_model_config=policy_model_config,
            q_model_config=q_model_config,
            twin_q=twin_q,
            initial_alpha=initial_alpha,
            target_entropy=target_entropy,
            **kwargs,
        )

        # Network that produces the unmasked logits from the "observation"
        # part of the Dict space.
        self.internal_model = TorchFC(
            orig_space["observation"],
            action_space,
            num_outputs,
            model_config,
            name + "_internal",
        )

    @override(SACTorchModel)
    def forward(self, input_dict, state, seq_lens):
        """Return ``(masked_logits, state)`` for the given batch.

        ``input_dict["obs"]`` must provide "action_mask" and "observation";
        ``state`` is passed through unchanged and ``seq_lens`` is not used.
        """
        # Extract the available actions tensor from the observation.
        action_mask = input_dict["obs"]["action_mask"]

        # Compute the unmasked logits.
        logits, _ = self.internal_model({"obs": input_dict["obs"]["observation"]})

        # Convert action_mask into a [0.0 || -inf]-type mask: log(1) = 0 and
        # log(0) = -inf, which the clamp replaces with FLOAT_MIN.
        inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)
        masked_logits = logits + inf_mask

        # Return masked logits.
        return masked_logits, state

    def value_function(self):
        """Delegate to the internal network's value function."""
        return self.internal_model.value_function()


In [None]:
# Train a "main" SAC policy against a fixed opponent: the policy "policy_0"
# taken from `best_result`'s checkpoint.
# NOTE(review): `best_result` is assigned in more than one earlier cell; the
# checkpoint used here must contain a policy named "policy_0" — confirm which
# run it is meant to come from.
AddPolicyCallback = create_add_policy_cb("policy_0", best_result.checkpoint)

config = (
    sac.SACConfig()
    .environment("connect4")
    .framework("torch")
    .callbacks(AddPolicyCallback)
    .training(
        # The same action-masking model class is used for the policy network
        # and for the Q network.
        q_model_config={"custom_model": SacConnect4MaskModel1},
        policy_model_config={"custom_model": SacConnect4MaskModel1},
    )
    .multi_agent(
        # Only "main" is declared here; the "opponent" policy that
        # `policy_mapping_fn` also refers to is added by AddPolicyCallback
        # when the algorithm is initialised.
        policies={"main"},
        # Only "main" is optimised.
        policies_to_train=["main"],
        # Seats alternate between "main" and "opponent" by episode-id parity,
        # so "main" never plays against itself.
        policy_mapping_fn=policy_mapping_fn,
    )
)
stop = {"timesteps_total": 10_000_000, "training_iteration": 500}
self_results = tune.Tuner(
    "SAC",
    param_space=config.to_dict(),
    run_config=air.RunConfig(
        stop=stop,
        verbose=2,
        # Save a checkpoint every 10 iterations and once more at the end.
        checkpoint_config=air.CheckpointConfig(
            checkpoint_frequency=10,
            checkpoint_at_end=True,
        ),
        # Progress table columns: result key -> column header.
        # NOTE(review): "win_rate" and "league_size" are written by
        # SelfPlayCallback, which is not attached to this run.
        progress_reporter=CLIReporter(
            sort_by_metric=True,
            metric_columns={
                "training_iteration": "iter",
                "time_total_s": "time_total_s",
                "timesteps_total": "ts",
                "episodes_this_iter": "train_episodes",
                "policy_reward_mean/main": "reward",
                "win_rate": "win_rate",
                "league_size": "league_size",
            },
        ),
    ),
).fit()

# Tensorboard

In [None]:
!tensorboard --logdir /home/clem/ray_results/