# Imports


In [1]:
import optuna 
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.callbacks import BaseCallback
import os 
import time
import matplotlib.pyplot as plt
import numpy as np
from utils.balanced_env import BalancedStreetFighterEnv
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

# Optuna HPT
## Optuna Config


In [4]:
# Output locations: LOG_DIR receives the Monitor/TensorBoard logs, OPT_DIR the
# Optuna study database and the per-trial models.
LOG_DIR = './logs/'
OPT_DIR = './opt/'

# Create both directories up front; already-existing ones are left untouched.
for _output_dir in (LOG_DIR, OPT_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
def optimize_ppo(trial):
    """Sample one set of PPO hyperparameters from an Optuna trial.

    Args:
        trial: The ``optuna.trial.Trial`` used to draw the values.

    Returns:
        dict: Keyword arguments (``n_steps``, ``gamma``, ``learning_rate``,
        ``clip_range``, ``gae_lambda``) that can be unpacked into ``PPO(...)``.
    """
    return {
        # step=64 keeps every candidate a multiple of 64, so the sampled value
        # can be used as-is instead of being overridden by hand afterwards.
        'n_steps': trial.suggest_int('n_steps', 2048, 8192, step=64),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; plain suggest_float replaces suggest_uniform.
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.99),
    }

In [None]:
# Path (without extension) of the model saved for trial number 1.
SAVE_PATH = os.path.join(OPT_DIR, 'trial_%d_best_model' % 1)

In [2]:
def optimize_agent(trial, total_timesteps=100000, n_eval_episodes=5):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    Args:
        trial: The ``optuna.trial.Trial`` being evaluated.
        total_timesteps: Training budget passed to ``model.learn``.
        n_eval_episodes: Number of episodes used by ``evaluate_policy``.

    Returns:
        The mean evaluation reward, or ``-1000`` if anything in the trial
        raised (best-effort: one broken trial must not abort the whole study).
    """
    env = None
    try:
        model_params = optimize_ppo(trial)

        # Create environment: raw env -> Monitor -> vectorised -> 4 stacked frames.
        env = BalancedStreetFighterEnv()
        env = Monitor(env, LOG_DIR)
        # Bind the monitored env to its own name so the factory below does not
        # depend on ``env``, which is rebound on the following lines.
        monitored_env = env
        env = DummyVecEnv([lambda: monitored_env])
        env = VecFrameStack(env, 4, channels_order='last')

        # Create and train the algorithm.
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        model.learn(total_timesteps=total_timesteps)

        # Evaluate the trained model on the same environment.
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)

        # Keep one model file per trial, named after the trial number.
        save_path = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(save_path)

        return mean_reward

    except Exception as e:
        # Report which trial failed and the exception type, then return the
        # sentinel score so the study continues.
        print('Trial {} failed: {!r}'.format(getattr(trial, 'number', '?'), e))
        return -1000

    finally:
        # Always release the environment, including when training or
        # evaluation raised; previously a failed trial left it open.
        if env is not None:
            try:
                env.close()
            except Exception as close_error:
                print('Could not close environment: {!r}'.format(close_error))

## Optuna Implementation
### Crear estudio


In [None]:
# Create the study backed by a SQLite file inside OPT_DIR, or resume it when a
# study with this name already exists there, then run the search.
study_name = 'ppo_street_fighter'  # Study name
storage_name = f'sqlite:///{OPT_DIR}/{study_name}.db'
study = optuna.create_study(
    direction='maximize',
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)
study.optimize(optimize_agent, n_jobs=1, n_trials=100)

#### Cargar un estudio viejo

In [None]:
# Re-open a previously created study from its SQLite file inside OPT_DIR.
study_name = 'ppo_street_fighter'  # Study name
storage_name = f'sqlite:///{OPT_DIR}/{study_name}.db'
study = optuna.load_study(
    storage=storage_name,
    study_name=study_name,
)

### Obtener el mejor modelo del estudio actual

In [None]:
# Display the best trial recorded so far in the study.
study.best_trial

# Entrenamiento del agente

## Configuración del callback

In [None]:
class TrainAndLoggingCallback(BaseCallback):
    """Save a checkpoint of the model every ``check_freq`` callback calls.

    Args:
        check_freq: Number of callback calls between two checkpoints.
        save_path: Directory that receives the checkpoints. ``None`` disables
            saving altogether.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Write ``best_model_<n_calls>`` every ``check_freq`` calls.

        Returns:
            bool: Always ``True``, so this callback never stops training.
        """
        # Same None guard as _init_callback: without it os.path.join(None, ...)
        # raises TypeError at the first checkpoint.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [None]:
# Directory that receives the periodic training checkpoints.
CHECKPOINT_DIR = './train/'
os.makedirs(name=CHECKPOINT_DIR, exist_ok=True)

In [None]:
# Checkpoint into CHECKPOINT_DIR every 10,000 callback calls.
callback = TrainAndLoggingCallback(
    save_path=CHECKPOINT_DIR,
    check_freq=10_000,
)

## Entrenar modelo
Crear el entorno

In [None]:
# Build the training environment inside-out: the raw env is wrapped in a
# Monitor (logging to LOG_DIR), vectorised, and finally frame-stacked (4 frames,
# channels last).
env = VecFrameStack(
    DummyVecEnv([lambda: Monitor(BalancedStreetFighterEnv(), LOG_DIR)]),
    4,
    channels_order='last',
)

Configurar los hiperparámetros del modelo. `n_steps` debe ser múltiplo de 64: usar 7488 o ajustar el valor obtenido en el estudio a un múltiplo de 64.

In [None]:
# Start from the best hyperparameters of the study and override n_steps with
# 7488, which is a multiple of 64 (117 * 64).
model_params = {**study.best_params, 'n_steps': 7488}
model_params

Definición del modelo

In [None]:
# Instantiate PPO with a CNN policy on the stacked-frame environment, logging
# to TensorBoard under LOG_DIR and using the tuned hyperparameters.
model = PPO(
    'CnnPolicy',
    env,
    verbose=1,
    tensorboard_log=LOG_DIR,
    **model_params,
)

Cargar los pesos preentrenados para transfer learning: usar el modelo que obtuvo los mejores resultados.

In [None]:
# ``load`` is a classmethod that builds and returns a NEW model; calling it on
# the instance and discarding the result leaves ``model`` with its freshly
# initialised weights. ``set_parameters`` loads the saved weights into the
# existing model in place, keeping the hyperparameters it was created with.
model.set_parameters(os.path.join(CHECKPOINT_DIR, 'best_model_5000000.zip'))

Entrenar y guardar el modelo

In [None]:
# Train for 5M timesteps with the checkpoint callback attached, then store the
# final model under LOG_DIR.
model.learn(callback=callback, total_timesteps=5_000_000)
model.save(os.path.join(LOG_DIR, 'final_model'))

## Cargar un modelo ya entrenado para su posterior evaluación

In [None]:
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import CheckpointCallback
import os
import time
import matplotlib.pyplot as plt
import numpy as np
from utils.balanced_env import BalancedStreetFighterEnv


# Rebuild the same environment stack used for training:
# raw env -> Monitor -> vectorised -> 4 stacked frames (channels last).
env = BalancedStreetFighterEnv()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

# Re-open the Optuna study to recover the hyperparameters the model was built with.
study_name = 'ppo_street_fighter'
storage_name = f'sqlite:///{OPT_DIR}/{study_name}.db'
study = optuna.load_study(study_name=study_name, storage=storage_name)

model_params = study.best_params
model_params['n_steps'] = 7488

model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

# ``model.load(...)`` is a classmethod returning a new model, so its result was
# being thrown away and ``model`` kept random weights. ``set_parameters`` loads
# the saved weights into ``model`` in place.
# The final model is written to LOG_DIR by the training cell, so read it from
# there rather than from CHECKPOINT_DIR.
model.set_parameters(os.path.join(LOG_DIR, 'final_model.zip'))