In [6]:
# --- PPO / training hyper-parameters ---------------------------------------
# Each constant is forwarded to the keyword noted beside it in the training cell.
TOTAL_TIMESTEP_NUMB = 800_000  # model.learn(total_timesteps=...)
LEARNING_RATE = 1e-4           # PPO(learning_rate=...)
GAE = 1.0                      # PPO(gae_lambda=...)
ENT_COEF = 0.01                # PPO(ent_coef=...)
N_STEPS = 512                  # PPO(n_steps=...)
GAMMA = 0.9                    # PPO(gamma=...)
BATCH_SIZE = 64                # PPO(batch_size=...)
N_EPOCHS = 10                  # PPO(n_epochs=...)

# --- Evaluation settings ----------------------------------------------------
# Both are handed to TrainAndLoggingCallback in the training cell.
EVAL_FREQ = 10_000             # TrainAndLoggingCallback(check_freq=...)
TEST_EPISODE_NUMBERS = 20      # TrainAndLoggingCallback(episode_num=...)

In [7]:
from pathlib import Path

# Output directory, relative to the kernel's current working directory.
# The training cell passes it to PPO as `tensorboard_log`.
save_dir = Path('./mlp_model')

In [8]:
from model_mlp import MarioNetMLP1, MarioNetMLP2, MarioNetMLP3, MarioNetMLP4
from model_cnn import MarioNet

def _extractor_policy_kwargs(extractor_cls, features_dim=256):
    """Return a fresh `policy_kwargs` dict selecting `extractor_cls`.

    Args:
        extractor_cls: class stored under the `features_extractor_class` key.
        features_dim: value stored as `features_dim` inside
            `features_extractor_kwargs` (every configuration below uses 256).

    Returns:
        A new dict (with its own nested kwargs dict) on every call, so the
        configurations never share mutable state.
    """
    return {
        'features_extractor_class': extractor_cls,
        'features_extractor_kwargs': {'features_dim': features_dim},
    }


# One configuration per MLP extractor imported from model_mlp.
policy_kwargs_1 = _extractor_policy_kwargs(MarioNetMLP1)
policy_kwargs_2 = _extractor_policy_kwargs(MarioNetMLP2)
policy_kwargs_3 = _extractor_policy_kwargs(MarioNetMLP3)
policy_kwargs_4 = _extractor_policy_kwargs(MarioNetMLP4)

# Control configuration: the MarioNet extractor imported from model_cnn.
policy_kwargs_control = _extractor_policy_kwargs(MarioNet)

In [9]:
from utils import make_parallel_env, STAGE_PIXEL, STAGE_RECTANGLE

# Both environments use the same stage and the same keyword options; only the
# second positional argument differs (8 for training, 1 for evaluation) —
# presumably the number of parallel environments; confirm in utils.make_parallel_env.
_shared_env_options = dict(stack=False, resize=42)
env = make_parallel_env(STAGE_RECTANGLE, 8, **_shared_env_options)
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of
# the kernel session; it is kept unchanged so existing references keep working.
eval = make_parallel_env(STAGE_RECTANGLE, 1, **_shared_env_options)

# Experiment table: run name -> training env, evaluation env and policy kwargs.
# Every run shares the same two environment objects; only policy_kwargs differs.
models = {
    run_name: {'env': env, 'eval': eval, 'policy_kwargs': extractor_kwargs}
    for run_name, extractor_kwargs in (
        ('MLP_1', policy_kwargs_1),
        ('MLP_2', policy_kwargs_2),
        ('MLP_3', policy_kwargs_3),
        ('MLP_4', policy_kwargs_4),
        ('CNN', policy_kwargs_control),
    )
}

In [10]:
from utils import TrainAndLoggingCallback
from stable_baselines3 import PPO

# The PPO hyper-parameters are identical for every run, so gather them once.
ppo_hyperparams = dict(
    learning_rate=LEARNING_RATE,
    n_steps=N_STEPS,
    batch_size=BATCH_SIZE,
    n_epochs=N_EPOCHS,
    gamma=GAMMA,
    gae_lambda=GAE,
    ent_coef=ENT_COEF,
)

# Train one PPO agent per entry of `models`, in the dict's insertion order.
# `model` and `callback` are rebound on each pass, so after the loop they
# refer to the objects of the last run only.
for key, run in models.items():
    print(f"Training {key}")
    model = PPO(
        'CnnPolicy',
        run['env'],
        verbose=0,
        policy_kwargs=run['policy_kwargs'],
        tensorboard_log=save_dir,
        **ppo_hyperparams,
    )
    callback = TrainAndLoggingCallback(
        test_env=run['eval'],
        check_freq=EVAL_FREQ,
        episode_num=TEST_EPISODE_NUMBERS,
    )

    # The run name doubles as the TensorBoard log name.
    model.learn(total_timesteps=TOTAL_TIMESTEP_NUMB, tb_log_name=key, callback=callback)



Training CNN
time steps: 80000
average reward: 781.8 average time: 200.2 best_reward: 1393.0
time steps: 160000
average reward: 1037.35 average time: 221.95 best_reward: 1408.0
time steps: 240000
average reward: 851.2 average time: 178.85 best_reward: 1314.0
time steps: 320000
average reward: 1145.7 average time: 226.65 best_reward: 2327.0
time steps: 400000
average reward: 1507.35 average time: 287.1 best_reward: 2603.0
time steps: 480000
average reward: 1647.6 average time: 334.15 best_reward: 3008.0
time steps: 560000
average reward: 1830.95 average time: 347.3 best_reward: 3012.0
time steps: 640000
average reward: 1528.75 average time: 296.15 best_reward: 3010.0
time steps: 720000
average reward: 1405.9 average time: 272.4 best_reward: 2592.0
time steps: 800000
average reward: 1426.9 average time: 269.25 best_reward: 2340.0
