In [3]:
import retro
import os
import cv2
import numpy as np
import optuna
from gymnasium import Env
from gymnasium.spaces import MultiBinary, Box
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback  
import time

In [28]:
class StreetFighter(Env):
    def __init__(self):
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        self.game = retro.make(
            game="StreetFighterIISpecialChampionEdition-Genesis",
            use_restricted_actions=retro.Actions.FILTERED
        )
        self.previous_frame = np.zeros((84, 84, 1), dtype=np.uint8)
        self.score = 0

    def preprocess(self, observation):
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        return np.reshape(resized, (84, 84, 1))

    def reset(self, **kwargs):
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs
        self.score = 0
        return obs, {}

    def step(self, action):
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs
        
        reward = info.get('score', 0) - self.score
        self.score = info.get('score', 0)
        
        terminated = done
        truncated = False
        return frame_delta, reward, terminated, truncated, info

    def render(self, *args, **kwargs):
        self.game.render(*args, **kwargs)

    def close(self):
        self.game.close()

In [29]:
LOG_DIR = "./logs/"
OPT_DIR = "./opt/"
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OPT_DIR, exist_ok=True)

In [9]:
def optimize_ppo(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', 2048, 8192),
        'gamma': trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range': trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_uniform('gae_lambda', 0.8, 0.99),
         
    }

def optimize_agent(trial):
    try:
        model_params = optimize_ppo(trial)
        
        print(f"\nStarting Trial {trial.number}")
        print("Parameters:")
        for key, value in model_params.items():
            print(f"  {key}: {value}")

        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, n_stack=4, channels_order='last')

        model = PPO(
            'CnnPolicy',
            env,
            verbose=0,
            tensorboard_log=LOG_DIR,
            **model_params
        )

        model.learn(total_timesteps=100000)

        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, f"trial_{trial.number}_best_model")
        model.save(SAVE_PATH)

        print(f"\nTrial {trial.number} finished")
        print(f"Mean reward: {mean_reward}")
        print("Parameters:")
        for key, value in model_params.items():
            print(f"  {key}: {value}")

        return mean_reward

    except Exception as e:
        print(f"\nTrial failed due to: {e}")
        return -1000

def print_best_trial(study, trial):
    print(f"\nCurrent best trial: {study.best_trial.number}")
    print(f"Best value: {study.best_trial.value}")
    print("Best params:")
    for key, value in study.best_trial.params.items():
        print(f"  {key}: {value}")

# Run optimization
study = optuna.create_study(direction='maximize')
study.optimize(
    optimize_agent, 
    n_trials=1000, 
    n_jobs=1,
    callbacks=[print_best_trial]
)

# Save the best model after all trials
if study.best_trial:
    best_model_path = os.path.join(OPT_DIR, "best_model")
    best_model = PPO.load(os.path.join(OPT_DIR, f"trial_{study.best_trial.number}_best_model"))
    best_model.save(best_model_path)
    print(f"\nSaved best model from trial {study.best_trial.number} to {best_model_path}")

[I 2025-04-08 14:23:52,264] A new study created in memory with name: no-name-2854ce20-1257-40c0-a632-f1b89490df18
  'gamma': trial.suggest_loguniform('gamma', 0.8, 0.9999),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
  'clip_range': trial.suggest_uniform('clip_range', 0.1, 0.4),
  'gae_lambda': trial.suggest_uniform('gae_lambda', 0.8, 0.99),



Starting Trial 0
Parameters:
  n_steps: 2234
  gamma: 0.9678535745643071
  learning_rate: 1.5818461356559774e-05
  clip_range: 0.183758152402001
  gae_lambda: 0.8889790303772501


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2234 and n_envs=1)
[I 2025-04-08 14:34:30,492] Trial 0 finished with value: 5000.0 and parameters: {'n_steps': 2234, 'gamma': 0.9678535745643071, 'learning_rate': 1.5818461356559774e-05, 'clip_range': 0.183758152402001, 'gae_lambda': 0.8889790303772501}. Best is trial 0 with value: 5000.0.



Trial 0 finished
Mean reward: 5000.0
Parameters:
  n_steps: 2234
  gamma: 0.9678535745643071
  learning_rate: 1.5818461356559774e-05
  clip_range: 0.183758152402001
  gae_lambda: 0.8889790303772501

Current best trial: 0
Best value: 5000.0
Best params:
  n_steps: 2234
  gamma: 0.9678535745643071
  learning_rate: 1.5818461356559774e-05
  clip_range: 0.183758152402001
  gae_lambda: 0.8889790303772501

Starting Trial 1
Parameters:
  n_steps: 6808
  gamma: 0.9548798932936535
  learning_rate: 1.9484615633222276e-05
  clip_range: 0.32094729568276137
  gae_lambda: 0.8498948429129224


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=6808 and n_envs=1)
[I 2025-04-08 14:46:36,150] Trial 1 finished with value: 1000.0 and parameters: {'n_steps': 6808, 'gamma': 0.9548798932936535, 'learning_rate': 1.9484615633222276e-05, 'clip_range': 0.32094729568276137, 'gae_lambda': 0.8498948429129224}. Best is trial 0 with value: 5000.0.



Trial 1 finished
Mean reward: 1000.0
Parameters:
  n_steps: 6808
  gamma: 0.9548798932936535
  learning_rate: 1.9484615633222276e-05
  clip_range: 0.32094729568276137
  gae_lambda: 0.8498948429129224

Current best trial: 0
Best value: 5000.0
Best params:
  n_steps: 2234
  gamma: 0.9678535745643071
  learning_rate: 1.5818461356559774e-05
  clip_range: 0.183758152402001
  gae_lambda: 0.8889790303772501

Starting Trial 2
Parameters:
  n_steps: 3655
  gamma: 0.8310346527480125
  learning_rate: 6.381163133597985e-05
  clip_range: 0.3056094450063675
  gae_lambda: 0.9885261965946768


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3655 and n_envs=1)
[I 2025-04-08 14:58:43,909] Trial 2 finished with value: 3000.0 and parameters: {'n_steps': 3655, 'gamma': 0.8310346527480125, 'learning_rate': 6.381163133597985e-05, 'clip_range': 0.3056094450063675, 'gae_lambda': 0.9885261965946768}. Best is trial 0 with value: 5000.0.



Trial 2 finished
Mean reward: 3000.0
Parameters:
  n_steps: 3655
  gamma: 0.8310346527480125
  learning_rate: 6.381163133597985e-05
  clip_range: 0.3056094450063675
  gae_lambda: 0.9885261965946768

Current best trial: 0
Best value: 5000.0
Best params:
  n_steps: 2234
  gamma: 0.9678535745643071
  learning_rate: 1.5818461356559774e-05
  clip_range: 0.183758152402001
  gae_lambda: 0.8889790303772501

Starting Trial 3
Parameters:
  n_steps: 3795
  gamma: 0.9003445466748287
  learning_rate: 4.960542238487099e-05
  clip_range: 0.23687395041796724
  gae_lambda: 0.8192653363600547


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3795 and n_envs=1)
[I 2025-04-08 15:11:08,444] Trial 3 finished with value: 45600.0 and parameters: {'n_steps': 3795, 'gamma': 0.9003445466748287, 'learning_rate': 4.960542238487099e-05, 'clip_range': 0.23687395041796724, 'gae_lambda': 0.8192653363600547}. Best is trial 3 with value: 45600.0.



Trial 3 finished
Mean reward: 45600.0
Parameters:
  n_steps: 3795
  gamma: 0.9003445466748287
  learning_rate: 4.960542238487099e-05
  clip_range: 0.23687395041796724
  gae_lambda: 0.8192653363600547

Current best trial: 3
Best value: 45600.0
Best params:
  n_steps: 3795
  gamma: 0.9003445466748287
  learning_rate: 4.960542238487099e-05
  clip_range: 0.23687395041796724
  gae_lambda: 0.8192653363600547

Starting Trial 4
Parameters:
  n_steps: 3039
  gamma: 0.926252636063524
  learning_rate: 2.3992577883004538e-05
  clip_range: 0.11617916941987315
  gae_lambda: 0.8257397532752823


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3039 and n_envs=1)
[I 2025-04-08 15:20:51,803] Trial 4 finished with value: 1700.0 and parameters: {'n_steps': 3039, 'gamma': 0.926252636063524, 'learning_rate': 2.3992577883004538e-05, 'clip_range': 0.11617916941987315, 'gae_lambda': 0.8257397532752823}. Best is trial 3 with value: 45600.0.



Trial 4 finished
Mean reward: 1700.0
Parameters:
  n_steps: 3039
  gamma: 0.926252636063524
  learning_rate: 2.3992577883004538e-05
  clip_range: 0.11617916941987315
  gae_lambda: 0.8257397532752823

Current best trial: 3
Best value: 45600.0
Best params:
  n_steps: 3795
  gamma: 0.9003445466748287
  learning_rate: 4.960542238487099e-05
  clip_range: 0.23687395041796724
  gae_lambda: 0.8192653363600547

Starting Trial 5
Parameters:
  n_steps: 2366
  gamma: 0.8300244971275635
  learning_rate: 1.973386902902725e-05
  clip_range: 0.20785129729086949
  gae_lambda: 0.9575934732899748


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2366 and n_envs=1)
[I 2025-04-08 15:31:40,840] Trial 5 finished with value: 2000.0 and parameters: {'n_steps': 2366, 'gamma': 0.8300244971275635, 'learning_rate': 1.973386902902725e-05, 'clip_range': 0.20785129729086949, 'gae_lambda': 0.9575934732899748}. Best is trial 3 with value: 45600.0.



Trial 5 finished
Mean reward: 2000.0
Parameters:
  n_steps: 2366
  gamma: 0.8300244971275635
  learning_rate: 1.973386902902725e-05
  clip_range: 0.20785129729086949
  gae_lambda: 0.9575934732899748

Current best trial: 3
Best value: 45600.0
Best params:
  n_steps: 3795
  gamma: 0.9003445466748287
  learning_rate: 4.960542238487099e-05
  clip_range: 0.23687395041796724
  gae_lambda: 0.8192653363600547

Starting Trial 6
Parameters:
  n_steps: 3345
  gamma: 0.9343268529642388
  learning_rate: 9.391889170775487e-05
  clip_range: 0.1738704305480394
  gae_lambda: 0.8827222981121964


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3345 and n_envs=1)
[I 2025-04-08 15:42:03,867] Trial 6 finished with value: 2000.0 and parameters: {'n_steps': 3345, 'gamma': 0.9343268529642388, 'learning_rate': 9.391889170775487e-05, 'clip_range': 0.1738704305480394, 'gae_lambda': 0.8827222981121964}. Best is trial 3 with value: 45600.0.



Trial 6 finished
Mean reward: 2000.0
Parameters:
  n_steps: 3345
  gamma: 0.9343268529642388
  learning_rate: 9.391889170775487e-05
  clip_range: 0.1738704305480394
  gae_lambda: 0.8827222981121964

Current best trial: 3
Best value: 45600.0
Best params:
  n_steps: 3795
  gamma: 0.9003445466748287
  learning_rate: 4.960542238487099e-05
  clip_range: 0.23687395041796724
  gae_lambda: 0.8192653363600547

Starting Trial 7
Parameters:
  n_steps: 5919
  gamma: 0.8754413897158089
  learning_rate: 3.989971379684793e-05
  clip_range: 0.12406691266492693
  gae_lambda: 0.925169234690562


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=5919 and n_envs=1)
[I 2025-04-08 15:53:14,335] Trial 7 finished with value: 1500.0 and parameters: {'n_steps': 5919, 'gamma': 0.8754413897158089, 'learning_rate': 3.989971379684793e-05, 'clip_range': 0.12406691266492693, 'gae_lambda': 0.925169234690562}. Best is trial 3 with value: 45600.0.



Trial 7 finished
Mean reward: 1500.0
Parameters:
  n_steps: 5919
  gamma: 0.8754413897158089
  learning_rate: 3.989971379684793e-05
  clip_range: 0.12406691266492693
  gae_lambda: 0.925169234690562

Current best trial: 3
Best value: 45600.0
Best params:
  n_steps: 3795
  gamma: 0.9003445466748287
  learning_rate: 4.960542238487099e-05
  clip_range: 0.23687395041796724
  gae_lambda: 0.8192653363600547

Starting Trial 8
Parameters:
  n_steps: 2504
  gamma: 0.8248434861766549
  learning_rate: 1.7645720546779833e-05
  clip_range: 0.11598136779051013
  gae_lambda: 0.8641566910901479


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2504 and n_envs=1)
[I 2025-04-08 16:05:09,195] Trial 8 finished with value: 4200.0 and parameters: {'n_steps': 2504, 'gamma': 0.8248434861766549, 'learning_rate': 1.7645720546779833e-05, 'clip_range': 0.11598136779051013, 'gae_lambda': 0.8641566910901479}. Best is trial 3 with value: 45600.0.



Trial 8 finished
Mean reward: 4200.0
Parameters:
  n_steps: 2504
  gamma: 0.8248434861766549
  learning_rate: 1.7645720546779833e-05
  clip_range: 0.11598136779051013
  gae_lambda: 0.8641566910901479

Current best trial: 3
Best value: 45600.0
Best params:
  n_steps: 3795
  gamma: 0.9003445466748287
  learning_rate: 4.960542238487099e-05
  clip_range: 0.23687395041796724
  gae_lambda: 0.8192653363600547

Starting Trial 9
Parameters:
  n_steps: 5230
  gamma: 0.8147419319162601
  learning_rate: 3.7924326880121175e-05
  clip_range: 0.2916564660522714
  gae_lambda: 0.9768708220445755


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=5230 and n_envs=1)
[I 2025-04-08 16:17:39,887] Trial 9 finished with value: 0.0 and parameters: {'n_steps': 5230, 'gamma': 0.8147419319162601, 'learning_rate': 3.7924326880121175e-05, 'clip_range': 0.2916564660522714, 'gae_lambda': 0.9768708220445755}. Best is trial 3 with value: 45600.0.



Trial 9 finished
Mean reward: 0.0
Parameters:
  n_steps: 5230
  gamma: 0.8147419319162601
  learning_rate: 3.7924326880121175e-05
  clip_range: 0.2916564660522714
  gae_lambda: 0.9768708220445755

Current best trial: 3
Best value: 45600.0
Best params:
  n_steps: 3795
  gamma: 0.9003445466748287
  learning_rate: 4.960542238487099e-05
  clip_range: 0.23687395041796724
  gae_lambda: 0.8192653363600547

Saved best model from trial 3 to ./opt/best_model


In [10]:
study.best_trial

FrozenTrial(number=3, state=TrialState.COMPLETE, values=[45600.0], datetime_start=datetime.datetime(2025, 4, 8, 14, 58, 43, 924346), datetime_complete=datetime.datetime(2025, 4, 8, 15, 11, 8, 440268), params={'n_steps': 3795, 'gamma': 0.9003445466748287, 'learning_rate': 4.960542238487099e-05, 'clip_range': 0.23687395041796724, 'gae_lambda': 0.8192653363600547}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_steps': IntDistribution(high=8192, log=False, low=2048, step=1), 'gamma': FloatDistribution(high=0.9999, log=True, low=0.8, step=None), 'learning_rate': FloatDistribution(high=0.0001, log=True, low=1e-05, step=None), 'clip_range': FloatDistribution(high=0.4, log=False, low=0.1, step=None), 'gae_lambda': FloatDistribution(high=0.99, log=False, low=0.8, step=None)}, trial_id=3, value=None)

In [6]:
model = PPO.load(os.path.join(OPT_DIR, 'trial_3_best_model.zip'))

In [7]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self,check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def __init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok = True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model{}'.format(self.n_calls))
            self.model.save(model_path)
        return True

In [8]:
CHECKPOINT_DIR = './train/'

In [15]:
model_params = params={'n_steps': 3776, 'gamma': 0.9003445466748287, 'learning_rate': 4.960542238487099e-05, 'clip_range': 0.23687395041796724, 'gae_lambda': 0.8192653363600547}

In [30]:
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4, channels_order='last')

In [17]:
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR, **model_params)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [18]:
model.load(os.path.join(OPT_DIR, 'trial_3_best_model.zip'))

<stable_baselines3.ppo.ppo.PPO at 0x24a1dc116a0>

In [20]:
model.learn(total_timesteps=100000, callback = callback)

Logging to ./logs/PPO_13
-----------------------------
| time/              |      |
|    fps             | 206  |
|    iterations      | 1    |
|    time_elapsed    | 18   |
|    total_timesteps | 3776 |
-----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 180       |
|    iterations           | 2         |
|    time_elapsed         | 41        |
|    total_timesteps      | 7552      |
| train/                  |           |
|    approx_kl            | 0.4922871 |
|    clip_fraction        | 0.546     |
|    clip_range           | 0.237     |
|    entropy_loss         | -6.36     |
|    explained_variance   | 0.345     |
|    learning_rate        | 4.96e-05  |
|    loss                 | 324       |
|    n_updates            | 230       |
|    policy_gradient_loss | 0.0529    |
|    value_loss           | 4.82e+04  |
---------------------------------------
----------------------------------------
| ro

<stable_baselines3.ppo.ppo.PPO at 0x24a7c64a670>

In [32]:
model = PPO.load('./opt/trial_3_best_model.zip')

In [25]:
env.close()

In [33]:
mean_reward , _ = evaluate_policy(model, env, render=True, n_eval_episodes=1)

In [34]:
mean_reward


45600.0

In [35]:
obs = env.reset()

In [47]:
# Close existing environment
env.close()

# Recreate the environment
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4, channels_order='last')

# Start fresh
obs = env.reset()
done = False
for game in range(1):
    while not done:
        # For DummyVecEnv, access the underlying environment
        env.envs[0].render()
        
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        time.sleep(0.01)
        print(reward)


[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]


KeyboardInterrupt: 