In [1]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import time
from matplotlib import pyplot as plt
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3 import PPO

from gym.wrappers import GrayScaleObservation

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy
import numpy as np
import os
from stable_baselines3.common.callbacks import BaseCallback

import optuna

from stable_baselines3.common.evaluation import evaluate_policy

import os

np.seterr(over='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
def optimize_ppo(trial): 
    return {
        'n_steps':trial.suggest_int('n_steps', 2048, 8192),
        'gamma':trial.suggest_float('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_float('learning_rate', 1e-5, 1e-4),
        'clip_range':trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda':trial.suggest_float('gae_lambda', 0.8, 0.99)
    }
 

In [3]:
def optimize_agent(trial):
    
    try:
        env = gym_super_mario_bros.make('SuperMarioBros-v0')
        env = JoypadSpace(env, SIMPLE_MOVEMENT)

        log_dir = './log_dir2/'
        os.makedirs(log_dir, exist_ok=True)

        env = Monitor(env, log_dir)

        env = GrayScaleObservation(env,keep_dim=True)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env,4,channels_order='last')

    
        model_params = optimize_ppo(trial) 
    

        tensorboard_log = r'./logs/'
        model = PPO("CnnPolicy", env, verbose=0,tensorboard_log=tensorboard_log,**model_params)
        model.learn(total_timesteps=20000)
    
        mean_reward, _ = evaluate_policy(model, env,n_eval_episodes=5)
    
    
        env.close()
    
        OPT_DIR  = r'./optuna_model/'
        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_model'.format(trial.number))
        model.save(SAVE_PATH)
    
        return mean_reward    

    except Exception as e:
        print(e)
        return -1000


In [4]:
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=15)

[32m[I 2023-05-01 04:01:16,250][0m A new study created in memory with name: no-name-230999c6-4101-4633-8df9-584945aa77b6[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8147 and n_envs=1)
[32m[I 2023-05-01 04:40:59,416][0m Trial 0 finished with value: 339.0 and parameters: {'n_steps': 8147, 'gamma': 0.872155364861793, 'learning_rate': 8.043634236309485e-05, 'clip_range': 0.2511295705204688, 'gae_lambda': 0.8821735723563577}. Best is trial 0 with value: 339.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8077 and n_envs=1)
[32m[I 2023-05-01 05:19:58,729][0m Trial 1 finished with value: 680.0 and parameters: {'n_steps': 8077, 'gamma': 0.8243481906657717, 'learning_rate': 2.487400548685807e-05, 'clip_range': 0.23854929028781088, 'gae_lambda': 0.8062800298047543}. Best is trial 1 with value: 680.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=26

We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3261 and n_envs=1)
[32m[I 2023-05-01 20:42:19,444][0m Trial 12 finished with value: 402.0 and parameters: {'n_steps': 3261, 'gamma': 0.840455367604498, 'learning_rate': 6.350820438706596e-05, 'clip_range': 0.3232337540093627, 'gae_lambda': 0.8423259048916356}. Best is trial 3 with value: 741.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2966 and n_envs=1)
[32m[I 2023-05-01 21:34:06,274][0m Trial 13 finished with value: 742.0 and parameters: {'n_steps': 2966, 'gamma': 0.8028237431779085, 'learning_rate': 6.23676101553937e-05, 'clip_range': 0.18300936299542525, 'gae_lambda': 0.8375544225396828}. Best is trial 13 with value: 742.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4879 and n_envs=1)
[32m[I 2023-05-01 22:15:43,788][0m Trial 14 finished with value: 720.0 and parameters: {'n_steps': 4879, '

In [5]:
dir(study)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_ask',
 '_directions',
 '_is_multi_objective',
 '_log_completed_trial',
 '_pop_waiting_trial_id',
 '_should_skip_enqueue',
 '_stop_flag',
 '_storage',
 '_study_id',
 '_tell',
 '_thread_local',
 'add_trial',
 'add_trials',
 'ask',
 'best_params',
 'best_trial',
 'best_trials',
 'best_value',
 'direction',
 'directions',
 'enqueue_trial',
 'get_trials',
 'optimize',
 'pruner',
 'sampler',
 'set_system_attr',
 'set_user_attr',
 'stop',
 'study_name',
 'system_attrs',
 'tell',
 'trials',
 'trials_dataframe',
 'user_attrs']

In [6]:
study.best_params

{'n_steps': 2966,
 'gamma': 0.8028237431779085,
 'learning_rate': 6.23676101553937e-05,
 'clip_range': 0.18300936299542525,
 'gae_lambda': 0.8375544225396828}

In [7]:
study.best_trial

FrozenTrial(number=13, state=TrialState.COMPLETE, values=[742.0], datetime_start=datetime.datetime(2023, 5, 1, 20, 42, 19, 446491), datetime_complete=datetime.datetime(2023, 5, 1, 21, 34, 6, 273542), params={'n_steps': 2966, 'gamma': 0.8028237431779085, 'learning_rate': 6.23676101553937e-05, 'clip_range': 0.18300936299542525, 'gae_lambda': 0.8375544225396828}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_steps': IntDistribution(high=8192, log=False, low=2048, step=1), 'gamma': FloatDistribution(high=0.9999, log=False, low=0.8, step=None), 'learning_rate': FloatDistribution(high=0.0001, log=False, low=1e-05, step=None), 'clip_range': FloatDistribution(high=0.4, log=False, low=0.1, step=None), 'gae_lambda': FloatDistribution(high=0.99, log=False, low=0.8, step=None)}, trial_id=13, value=None)