# Imports


In [1]:
import optuna 
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.callbacks import BaseCallback
import os 
import time
import matplotlib.pyplot as plt
import numpy as np
from utils.balanced_env import BalancedStreetFighterEnv
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

# Optuna HPT
## Optuna Config


In [2]:
LOG_DIR = './A2C/logs/'
OPT_DIR = './A2C/opt/'
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OPT_DIR, exist_ok=True)

In [3]:
def optimize_a2c(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', 2048, 8192),
        'gamma': trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3), 
        'ent_coef': trial.suggest_loguniform('ent_coef', 0.00001, 0.1),
        'vf_coef': trial.suggest_uniform('vf_coef', 0.5, 1.0),
        'max_grad_norm': trial.suggest_uniform('max_grad_norm', 0.3, 1.0),
        'rms_prop_eps': trial.suggest_loguniform('rms_prop_eps', 1e-5, 1e-3)
    }


In [4]:
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(1))

In [5]:
def optimize_agent(trial):
    try:
        model_params = optimize_a2c(trial)

        # Create environment 
        env = BalancedStreetFighterEnv(state='guile')
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')

        # Create algo 
        model = A2C('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)
        model.learn(total_timesteps=3000)

        # Evaluate model 
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        print(e)
        return -1000

## Optuna Implementation
### Crear estudio


In [6]:
study_name = 'a2c_street_fighter'  # Nombre del estudio
storage_name = f'sqlite:///{OPT_DIR}/{study_name}.db'
study = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True, direction='maximize')
study.optimize(optimize_agent, n_trials=4, n_jobs=1)

[I 2024-07-29 13:56:05,606] Using an existing study with name 'a2c_street_fighter' instead of creating a new one.
  'gamma': trial.suggest_loguniform('gamma', 0.8, 0.9999),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
  'ent_coef': trial.suggest_loguniform('ent_coef', 0.00001, 0.1),
  'vf_coef': trial.suggest_uniform('vf_coef', 0.5, 1.0),
  'max_grad_norm': trial.suggest_uniform('max_grad_norm', 0.3, 1.0),
  'rms_prop_eps': trial.suggest_loguniform('rms_prop_eps', 1e-5, 1e-3)


Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to ./A2C/logs/A2C_2


[I 2024-07-29 13:58:54,647] Trial 1 finished with value: -5141.270876 and parameters: {'n_steps': 5043, 'gamma': 0.9534976413311825, 'learning_rate': 5.20678472087529e-05, 'ent_coef': 2.47676629261965e-05, 'vf_coef': 0.7249132805843532, 'max_grad_norm': 0.6683098974487623, 'rms_prop_eps': 3.0085513878310358e-05}. Best is trial 1 with value: -5141.270876.
  'gamma': trial.suggest_loguniform('gamma', 0.8, 0.9999),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
  'ent_coef': trial.suggest_loguniform('ent_coef', 0.00001, 0.1),
  'vf_coef': trial.suggest_uniform('vf_coef', 0.5, 1.0),
  'max_grad_norm': trial.suggest_uniform('max_grad_norm', 0.3, 1.0),
  'rms_prop_eps': trial.suggest_loguniform('rms_prop_eps', 1e-5, 1e-3)


Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to ./A2C/logs/A2C_3


[I 2024-07-29 14:01:32,662] Trial 2 finished with value: -1916.97676 and parameters: {'n_steps': 6521, 'gamma': 0.9404101765628996, 'learning_rate': 3.832870425425772e-05, 'ent_coef': 2.62449062560913e-05, 'vf_coef': 0.959228009444008, 'max_grad_norm': 0.9040598330542258, 'rms_prop_eps': 0.0008500267190408886}. Best is trial 2 with value: -1916.97676.


Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to ./A2C/logs/A2C_4


[I 2024-07-29 14:02:36,546] Trial 3 finished with value: -1196.51843 and parameters: {'n_steps': 2295, 'gamma': 0.8472993984378927, 'learning_rate': 2.866412937949563e-05, 'ent_coef': 0.015274962503466338, 'vf_coef': 0.8300803063158387, 'max_grad_norm': 0.8654423309452886, 'rms_prop_eps': 1.5143390427306904e-05}. Best is trial 3 with value: -1196.51843.


Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to ./A2C/logs/A2C_5


[W 2024-07-29 14:02:44,670] Trial 4 failed with parameters: {'n_steps': 3949, 'gamma': 0.8437190808909862, 'learning_rate': 1.742306966441677e-05, 'ent_coef': 0.0009232607710687649, 'vf_coef': 0.7720834516172685, 'max_grad_norm': 0.8973374085212571, 'rms_prop_eps': 0.0006293105182012551} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\nicol\anaconda3\envs\stf2-enviroment\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\nicol\AppData\Local\Temp\ipykernel_21488\2707853282.py", line 13, in optimize_agent
    model.learn(total_timesteps=3000)
  File "c:\Users\nicol\anaconda3\envs\stf2-enviroment\lib\site-packages\stable_baselines3\a2c\a2c.py", line 192, in learn
    return super(A2C, self).learn(
  File "c:\Users\nicol\anaconda3\envs\stf2-enviroment\lib\site-packages\stable_baselines3\common\on_policy_algorithm.py", line 237, in learn
    continue_training = self

KeyboardInterrupt: 

#### Cargar un estudio viejo

In [None]:
study_name = 'A2C_street_fighter'  # Nombre del estudio
storage_name = f'sqlite:///{OPT_DIR}/{study_name}.db'
study = optuna.load_study(study_name=study_name, storage=storage_name)

### Obtener el mejor modelo del estudio actual

In [None]:
study.best_trial

# Entrenamiento del agente

## Configuracion del callback

In [None]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [None]:
CHECKPOINT_DIR = '/PPO2/train/'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

In [None]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

## Entrenar modelo
Crear el entorno

In [None]:
env = BalancedStreetFighterEnv()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

Configurar hyperparametros del modelo, usar 7488 o dividir los n_steps por 64

In [None]:
model_params = study.best_params
model_params['n_steps'] = 7488
model_params

Definicion del modelo 

In [None]:
model = A2C('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

Cargar transfer learning de Optuna, usar el modelo que tuvo los mejores resultados

In [None]:
model.load(os.path.join(CHECKPOINT_DIR, 'best_model_5000000.zip'))

Entrenar y guardar el modelo

In [None]:
model.learn(total_timesteps=5000000, callback=callback)
model.save(os.path.join(LOG_DIR, 'final_model'))

## Cargar un modelo ya entrenado para su posterior evaluacion

In [None]:
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import CheckpointCallback
import os
import time
import matplotlib.pyplot as plt
import numpy as np
from utils.balanced_env import BalancedStreetFighterEnv


env = BalancedStreetFighterEnv()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

study_name = 'ppo_street_fighter'
storage_name = f'sqlite:///{OPT_DIR}/{study_name}.db'
study = optuna.load_study(study_name=study_name, storage=storage_name)

model_params = study.best_params
model_params['n_steps'] = 7488

model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

model.load(os.path.join(CHECKPOINT_DIR, 'final_model.zip'))