In [None]:
%pip install stable-baselines3[extra]

In [None]:
from stable_baselines3.common.env_checker import check_env
from games.tileman.envs.solo_player_env import SoloPlayerEnv

# Build a default SoloPlayerEnv and run Stable-Baselines3's environment checker on it;
# warn=True also asks the checker to emit its optional warnings.
env = SoloPlayerEnv()
try:
    check_env(env, warn=True)
finally:
    # Close the env even when the check raises, so a failed check does not
    # leave an open environment behind in the kernel.
    env.close()

In [None]:
from games.tileman.envs.solo_player_env import SoloPlayerEnv
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env

# Instantiate the env. It is built here so the cell works on a fresh kernel
# instead of relying on a `vec_env` variable left over from some other cell.
vec_env = make_vec_env(SoloPlayerEnv, n_envs=1)

# Create the agent; no training is started in this cell.
model = A2C("MlpPolicy", vec_env, verbose=1)

In [None]:
import numpy as np
import games.tileman.envs.solo_player_env
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy

from imitation.algorithms import bc
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env

# One seeded generator, handed to every call below that accepts `rng`.
rng = np.random.default_rng(0)

# Single-environment vectorised "tileman-solo-v0" env.
env = make_vec_env(
    "tileman-solo-v0",
    rng=rng,
    n_envs=1,
    post_wrappers=[
        # Needed for computing rollouts.
        lambda wrapped_env, _: RolloutInfoWrapper(wrapped_env),
    ],
)

def train_expert(total_timesteps=100_000):
    """Train a PPO agent on the module-level ``env`` to serve as the expert.

    Args:
        total_timesteps: Value passed to ``PPO.learn``. Defaults to 100_000,
            the budget that used to be hard-coded; pass a smaller number for a
            quick smoke run.

    Returns:
        The ``PPO`` model after ``learn`` has been called on it.
    """
    expert = PPO(
        policy=MlpPolicy,
        env=env,
        seed=0,  # fixed seed for the PPO model
        batch_size=64,
        ent_coef=0.0,
        learning_rate=0.0003,
        n_epochs=10,
        n_steps=64,
    )
    expert.learn(total_timesteps)
    return expert

In [None]:
def sample_expert_transitions():
    """Train an expert and return its demonstrations as flattened transitions.

    The expert comes from ``train_expert()`` and is rolled out on the
    module-level ``env`` with a stop condition of ``min_episodes=50``; the
    resulting trajectories are passed through ``rollout.flatten_trajectories``.
    """
    policy = train_expert()
    stop_condition = rollout.make_sample_until(min_timesteps=None, min_episodes=50)
    trajectories = rollout.rollout(policy, env, stop_condition, rng=rng)
    return rollout.flatten_trajectories(trajectories)

# Collect expert demonstrations (this trains the expert first).
transitions = sample_expert_transitions()

# Behaviour-cloning trainer over the same spaces as `env`, fed with the
# demonstrations gathered above.
bc_trainer = bc.BC(
    demonstrations=transitions,
    observation_space=env.observation_space,
    action_space=env.action_space,
    rng=rng,
)

In [None]:
# Separate env used only for evaluation.
evaluation_env = make_vec_env(
    "tileman-solo-v0",
    rng=rng,
    env_make_kwargs={"render_mode": "human"},  # for rendering
)


def evaluate_bc_policy(n_eval_episodes=3, render=False):
    """Evaluate the current ``bc_trainer.policy`` on ``evaluation_env``.

    Args:
        n_eval_episodes: Number of evaluation episodes passed to ``evaluate_policy``.
        render: Forwarded to ``evaluate_policy``; set to True to watch the
            episodes (slower).

    Returns:
        The first element of ``evaluate_policy``'s result (the reward value
        that is printed below).
    """
    mean_reward, _ = evaluate_policy(
        bc_trainer.policy,  # type: ignore[arg-type]
        evaluation_env,
        n_eval_episodes=n_eval_episodes,
        render=render,
    )
    return mean_reward


try:
    print("Evaluating the untrained policy.")
    reward = evaluate_bc_policy()
    print(f"Reward before training: {reward}")

    print("Training a policy using Behavior Cloning")
    bc_trainer.train(n_epochs=10)

    print("Evaluating the trained policy.")
    reward = evaluate_bc_policy()
    print(f"Reward after training: {reward}")
finally:
    # Always release the evaluation env, even if training or evaluation fails.
    evaluation_env.close()

In [None]:
import pygame
from stable_baselines3.common.env_util import make_vec_env
from games.tileman.envs.solo_player_env import SoloPlayerEnv
from games.tileman.envs.solo_player_env import SoloPlayerEnv
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env

# Instantiate the env
vec_env = make_vec_env(
    SoloPlayerEnv,
    n_envs=1,
    env_kwargs={"grid_size": 40, "vision_range": 5, "max_steps": 300},
)

# Train the agent
# NOTE(review): the budget was written as `10_00`, which is 1000 steps; it is
# spelled `1_000` here with the same value. The training log saved with this
# notebook reaches 10240 timesteps, which suggests an earlier run used a larger
# budget -- confirm the intended number.
model = PPO("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=1_000, progress_bar=True)
obs = vec_env.reset()

def run_eval():
    """Play one episode with ``model`` on ``vec_env``, rendering every step.

    Reads the module-level ``model`` and ``vec_env``. Once the env reports
    ``done``, prints how many steps were taken and the reward accumulated over
    the episode. Returns nothing.
    """
    total_reward = 0
    steps = 0
    obs = vec_env.reset()
    done = False
    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = vec_env.step(action)
        vec_env.render()

        total_reward += reward
        steps += 1

    # Report the real step count; the message previously formatted the literal 1.
    # No reset is issued here: the next call begins with its own vec_env.reset().
    print(f"Episode finished after {steps} timesteps, total rewards: {total_reward}")

#while True:
#    run_eval()

pygame 2.6.1 (SDL 2.28.4, Python 3.10.11)
Hello from the pygame community. https://www.pygame.org/contribute.html
Using cpu device


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0        |
| time/              |          |
|    fps             | 36       |
|    iterations      | 1        |
|    time_elapsed    | 55       |
|    total_timesteps | 2048     |
---------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | 0           |
| time/                   |             |
|    fps                  | 35          |
|    iterations           | 2           |
|    time_elapsed         | 115         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010790299 |
|    clip_fraction        | 0.0561      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | nan         |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0101     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0102     |
|    value_loss           | 0.00076     |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | 0           |
| time/                   |             |
|    fps                  | 34          |
|    iterations           | 3           |
|    time_elapsed         | 175         |
|    total_timesteps      | 6144        |
| train/                  |             |
|    approx_kl            | 0.006825514 |
|    clip_fraction        | 0.03        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37       |
|    explained_variance   | nan         |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00924    |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.00852    |
|    value_loss           | 7.62e-05    |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | 0           |
| time/                   |             |
|    fps                  | 34          |
|    iterations           | 4           |
|    time_elapsed         | 235         |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.009533215 |
|    clip_fraction        | 0.0572      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.35       |
|    explained_variance   | nan         |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0446      |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.00855    |
|    value_loss           | 2.29e-05    |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | 0           |
| time/                   |             |
|    fps                  | 34          |
|    iterations           | 5           |
|    time_elapsed         | 295         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.006582068 |
|    clip_fraction        | 0.0369      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.34       |
|    explained_variance   | nan         |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0132     |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0111     |
|    value_loss           | 1.68e-05    |
-----------------------------------------


In [5]:
from stable_baselines3.common.evaluation import evaluate_policy

try:
    # Only the first element of evaluate_policy's result is reported.
    reward, _ = evaluate_policy(
        model.policy,  # type: ignore[arg-type]
        vec_env,
        n_eval_episodes=3,
        render=False,  # set to True to watch the episodes (slower)
    )
    print(f"Reward: {reward}")
finally:
    # Close the env even if evaluation raises.
    vec_env.close()

Rewar: 0.0
