In [None]:
%pip install stable-baselines3[extra]

In [None]:
from stable_baselines3.common.env_checker import check_env
from games.tileman.envs.solo_player_env import SoloPlayerEnv

# Run the Stable-Baselines3 environment checker against the custom env.
env = SoloPlayerEnv()
try:
    check_env(env, warn=True)
finally:
    # Release the environment even when the check raises.
    env.close()

In [None]:
from games.tileman.envs.solo_player_env import SoloPlayerEnv
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env

# Instantiate the env (wrapped as a single-env VecEnv so this cell runs on a fresh kernel)
vec_env = make_vec_env(SoloPlayerEnv, n_envs=1)
# Create the agent; call model.learn(...) to actually train it
model = A2C("MlpPolicy", vec_env, verbose=1)

In [None]:
import numpy as np
import games.tileman.envs.solo_player_env
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy

from imitation.algorithms import bc
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env

# One seeded generator, reused for env creation, rollouts and the BC trainer.
rng = np.random.default_rng(0)

# Single-env vectorized environment; RolloutInfoWrapper is applied on top
# because it is needed for computing rollouts.
env = make_vec_env(
    "tileman-solo-v0",
    rng=rng,
    n_envs=1,
    post_wrappers=[lambda wrapped_env, _env_index: RolloutInfoWrapper(wrapped_env)],
)

def train_expert(total_timesteps=100_000):
    """Train a PPO agent on the notebook-level ``env`` to serve as the expert.

    Args:
        total_timesteps: Number of environment steps to train for. Defaults to
            100_000; pass a smaller value for a quick smoke test.

    Returns:
        The trained ``PPO`` model.
    """
    expert = PPO(
        policy=MlpPolicy,
        env=env,
        seed=0,
        batch_size=64,
        ent_coef=0.0,
        learning_rate=0.0003,
        n_epochs=10,
        n_steps=64,
    )
    expert.learn(total_timesteps)
    return expert

In [None]:
def sample_expert_transitions():
    """Train an expert, roll it out on ``env`` and return flattened transitions.

    The expert comes from ``train_expert()``; rollouts are collected until at
    least 50 episodes have been sampled, using the notebook-level ``rng``.
    """
    policy = train_expert()
    stop_after_50_episodes = rollout.make_sample_until(min_timesteps=None, min_episodes=50)
    trajectories = rollout.rollout(policy, env, stop_after_50_episodes, rng=rng)
    return rollout.flatten_trajectories(trajectories)

# Collect the expert demonstrations (this call trains the expert first).
transitions = sample_expert_transitions()

# Behavior-cloning trainer built from the env's spaces and the demonstrations.
bc_trainer = bc.BC(
    demonstrations=transitions,
    observation_space=env.observation_space,
    action_space=env.action_space,
    rng=rng,
)

In [None]:
evaluation_env = make_vec_env(
    "tileman-solo-v0",
    rng=rng,
    env_make_kwargs={"render_mode": "human"},  # for rendering
)


def _mean_eval_reward():
    """Return the mean episode reward of the current BC policy over 3 episodes."""
    mean_reward, _ = evaluate_policy(
        bc_trainer.policy,  # type: ignore[arg-type]
        evaluation_env,
        n_eval_episodes=3,
        render=False,  # set to True to have evaluate_policy render each step (slower)
    )
    return mean_reward


try:
    print("Evaluating the untrained policy.")
    reward = _mean_eval_reward()
    print(f"Reward before training: {reward}")

    print("Training a policy using Behavior Cloning")
    bc_trainer.train(n_epochs=10)

    print("Evaluating the trained policy.")
    reward = _mean_eval_reward()
    print(f"Reward after training: {reward}")
finally:
    # Release the evaluation env even if one of the steps above fails.
    evaluation_env.close()

In [None]:
!pip install -r requirements.txt

In [None]:
import pygame
from stable_baselines3.common.env_util import make_vec_env
from games.tileman.envs.solo_player_env import SoloPlayerEnv
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy

# Ten parallel copies of the solo env, created with grid_size=40 and vision_range=5.
vec_env = make_vec_env(
    "tileman-solo-v0",
    n_envs=10,
    env_kwargs={"grid_size": 40, "vision_range": 5},
)

# PPO agent with an MLP policy whose network architecture is two layers of 416 units.
model = PPO(
    "MlpPolicy",
    vec_env,
    verbose=0,
    n_steps=5000,
    batch_size=100,
    n_epochs=8,
    learning_rate=0.0002,
    policy_kwargs={"net_arch": [416, 416]},
)

class SaveEvalCallback(BaseCallback):
    """Periodically evaluate the model being trained and save new best models.

    Every ``eval_freq`` callback calls the model attached to this callback is
    evaluated for 3 episodes on a dedicated evaluation environment. Whenever
    the mean reward beats the best one seen so far, the model is saved under
    ``best_models/<n_calls>/<reward>``.

    Args:
        eval_freq: Evaluate every this many calls of ``_on_step``.
        verbose: When > 0, print the step count and reward of each evaluation.
    """

    def __init__(self, eval_freq=10000, verbose=0):
        super().__init__(verbose)
        # Dedicated env: evaluating on the training envs would reset them in
        # the middle of a rollout and disturb the data PPO is collecting.
        self.eval_env = make_vec_env("tileman-solo-v0", n_envs=1, env_kwargs=dict(grid_size=40, vision_range=5))
        self.eval_freq = eval_freq
        self.best_mean_reward = -float('inf')

    def _on_step(self) -> bool:
        """Run the periodic evaluation; always returns True so training continues."""
        if self.n_calls % self.eval_freq == 0:
            # Evaluate the model this callback is attached to, not a notebook global.
            reward, _ = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=3,
                render=False,
            )
            if self.verbose > 0:
                print(f"Step: {self.n_calls}, Reward: {reward}")
            if reward > self.best_mean_reward:
                self.best_mean_reward = reward
                self.model.save(f"best_models/{self.n_calls}/{reward}")
        return True
    
# Train with periodic evaluation/checkpointing, then reset the envs for later cells.
eval_callback = SaveEvalCallback(eval_freq=10000, verbose=1)
model.learn(
    total_timesteps=100_000_000,
    callback=eval_callback,
    progress_bar=True,
)
obs = vec_env.reset()

In [None]:
# Save the current `model` under the path "test" (relative to the working directory).
model.save("test")

In [None]:
def run_eval(eval_env=None, agent=None):
    """Play one rendered episode with the agent and print a summary line.

    The environment is stepped with deterministic actions and rendered after
    every step. The run ends as soon as any sub-environment of the vectorized
    env reports ``done``; the number of steps taken and the per-environment
    reward totals are then printed.

    Args:
        eval_env: Vectorized environment to play in. Defaults to the
            notebook-level ``vec_env``.
        agent: Object exposing ``predict(obs, deterministic=True)``. Defaults
            to the notebook-level ``model``.
    """
    import numpy as np  # local import: this cell does not import numpy itself

    eval_env = vec_env if eval_env is None else eval_env
    agent = model if agent is None else agent

    rewards = 0
    timesteps = 0
    obs = eval_env.reset()
    while True:
        action, _states = agent.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        eval_env.render()

        rewards += reward
        timesteps += 1

        # `done` holds one flag per sub-environment, so it cannot be used
        # directly as a loop condition; stop at the first finished episode.
        if np.any(done):
            print(f"Episode finished after {timesteps} timesteps, total rewards: {rewards}")
            return

# Replay episodes until the kernel is interrupted (Kernel -> Interrupt).
try:
    while True:
        run_eval()
except KeyboardInterrupt:
    # A manual interrupt is the intended way out of this loop; end the cell
    # cleanly instead of leaving a traceback.
    print("Evaluation loop stopped.")

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

# Mean episode reward of the current policy over 3 episodes on `vec_env`.
reward, _ = evaluate_policy(
    model.policy,  # type: ignore[arg-type]
    vec_env,
    n_eval_episodes=3,
    render=False,  # set to True to render during evaluation (slower)
)
print(f"Reward: {reward}")

# Nothing below uses the environments any more, so release them.
vec_env.close()