## Import Dependencies
Only need to do this once for your environment!

In [None]:
# Install retro
!pip install gym gym-retro

# We have to downgrade gym in order to preserve retro's functionality
!pip install gym==0.21.0

# A library we use to preprocess the frame for training
!pip install opencv-python

## Install the ROMs to retro
After retro is installed to your environment, you also have to add the Street Fighter ROMs to retro!
You do so by navigating to the location of your ROM in the terminal while your environment is activated
and running this command:
`python -m retro.import .`

ROM can be downloaded [here](https://wowroms.com/en/roms/sega-genesis-megadrive/street-fighter-ii-special-champion-edition-europe/26496.html). Just put it in a ROMs folder and do `extract here`.

## Setup SF2 For Gym-Retro

In [1]:
# Retro just allows us to interface with the game ROM through the emulator
import retro
# Import time so we can control the speed of the game manually
import time

## Setup Environment For Training

### What we're going to do
- Preprocess Observations (condense the info passed to the agent) - grayscale, frame delta, resize the frame so we have less pixels.
- Filter the action space - set through the `use_restricted_actions` parameter of `retro.make`.
- Custom reward function. For our purposes we are just going to do the score.

In [2]:
# Base environment class for a wrapper
from gym import Env
# The space shapes
from gym.spaces import MultiBinary, Box
# Helper libraries for preprocessing
import numpy as np
import cv2

from matplotlib import pyplot as plt

In [3]:
# Our custom environment
class StreetFighter(Env):
    """Gym environment wrapping the Street Fighter II retro game.

    Observations are 84x84 single-channel uint8 frames, actions are 12 binary
    flags, and the reward for a step is the change in ``info['score']`` since
    the previous step.
    """

    # Constructor
    def __init__(self):
        super().__init__()
        # Specify action and observation spaces
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        # Start an instance of the game
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.FILTERED)
        # Per-episode state; created here so step() cannot hit a missing
        # attribute if it is called before the first reset()
        self.previous_frame = None
        self.score = 0

    def reset(self):
        """Restart the game and return the first preprocessed frame."""
        # Gets the first frame and preprocesses it
        first_frame = self.preprocess(self.game.reset())
        # At the start, just make the prev frame the first frame as well
        self.previous_frame = first_frame
        # Score seen on the last step; the reward is measured against this
        self.score = 0

        return first_frame

    def preprocess(self, obs):
        """Convert a raw game frame into an 84x84x1 grayscale array."""
        # Grayscale the frame
        # NOTE(review): this uses the BGR conversion; if the emulator hands back
        # RGB frames the red/blue weights are swapped. Left unchanged so models
        # trained with this preprocessing still see the same input - confirm.
        grayed = cv2.cvtColor(obs, cv2.COLOR_BGR2GRAY)
        # Resize the frame
        resized = cv2.resize(grayed, (84, 84), interpolation=cv2.INTER_CUBIC)
        # Add a color channels value to resized
        return np.reshape(resized, (84, 84, 1))

    # Frame-step
    def step(self, action):
        """Advance the game one step.

        Returns:
            A tuple ``(obs, reward, done, info)`` where ``obs`` is the
            preprocessed frame and ``reward`` is the score gained this step.
            The emulator's own reward value is discarded.
        """
        # Take a step
        obs, _, done, info = self.game.step(action)
        obs = self.preprocess(obs)
        # Frame differencing is currently disabled: the agent receives the plain
        # preprocessed frame. previous_frame is still tracked for when it returns.
        self.previous_frame = obs
        # Reshape the reward function: reward is the score delta
        current_score = info['score']
        reward = current_score - self.score
        self.score = current_score

        return obs, reward, done, info

    def render(self, *args, **kwargs):
        """Render the game window; any arguments are accepted but ignored."""
        self.game.render()

    def close(self):
        """Shut down the underlying game instance."""
        self.game.close()

## More Dependencies

In [None]:
# Install the packages we need to do our reinforcement learning
# Pytorch is the base ML framework
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
# Stable-Baselines has a lot of stuff for reinforcement learning specifically
!pip install stable-baselines3[extra]
# Optuna is used to help us tune our hyperparameters efficiently
!pip install optuna

In [4]:
# Used for hyper-param optimization
import optuna
# PPO algorithm for RL
from stable_baselines3 import PPO
# Used to evaluate how well the model is performing on this environment
from stable_baselines3.common.evaluation import evaluate_policy
# Used for logging
from stable_baselines3.common.monitor import Monitor
# Import the vec wrappers to vectorize our frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

import os

  from .autonotebook import tqdm as notebook_tqdm


### Hyperparameter Tuning
This isn't the actual training, this is just an easy way for us to determine the best hyperparam values for PPO in this application.

In [9]:
LOG_DIR = './logs/'
OPT_DIR = './opt/'

# Function to return the test hyperparams
def OptimizePPO(trial: optuna.trial.Trial):
    """Sample one candidate set of PPO hyperparameters from ``trial``.

    Args:
        trial: The Optuna trial that the values are drawn from.

    Returns:
        A dict with the keys ``n_steps``, ``gamma``, ``learning_rate``,
        ``clip_range`` and ``gae_lambda``, ready to be passed to ``PPO`` as
        keyword arguments.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True keeps the log-scaled sampling for gamma and learning_rate.
    return {
        'n_steps': trial.suggest_int('n_steps', 2048, 8192),
        'gamma': trial.suggest_float('gamma', .8, .9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-7, 1e-6, log=True),
        'clip_range': trial.suggest_float('clip_range', .1, .4),
        'gae_lambda': trial.suggest_float('gae_lambda', .8, .99)
    }

In [8]:
# Run a training loop and return mean reward
def OptimizeAgent(trial: optuna.trial.Trial):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    Args:
        trial: The Optuna trial that supplies the hyperparameters and whose
            number is used in the name of the saved model.

    Returns:
        The mean reward over the evaluation episodes, or -1000 when any step
        of the trial raises an ``Exception``.
    """
    # Bound before the try block so the cleanup below never sees an unbound
    # name when building the environment is what failed
    env = None
    try:
        params = OptimizePPO(trial)

        # Create environment. `env` always names the outermost wrapper built
        # so far, so the cleanup closes whatever exists at the point of failure
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        # Separate name so the factory below does not depend on `env` being rebound
        monitored_env = env
        env = DummyVecEnv([lambda: monitored_env])
        env = VecFrameStack(env, 4, channels_order='last')

        # Create the model
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **params)
        # Train. More timesteps means better results but slower trials;
        # each trial currently runs for 300000 timesteps.
        model.learn(total_timesteps=300000, tb_log_name='LR_TEST')

        # Evaluate the model. n_eval_episodes is the number of games the model plays. Should be like 20 to 30.
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)

        # Save the model so we don't have to do this again
        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        # Keep the study running, but say why this trial was scored as a failure
        print('Trial {} failed: {!r}'.format(trial.number, e))
        return -1000

    finally:
        # Always release the environment, on success and on failure alike
        if env is not None:
            env.close()

In [None]:
# Set up the study; a higher mean reward is better, so the objective is maximized
study = optuna.create_study(direction='maximize')
# Run the trials one at a time. More trials (like 100) would search the space better,
# but a small number is used here so results come back sooner.
# A retro wrapper that allows parallelization would let n_jobs go above 1 and
# speed this up a lot, so that is worth looking into.
study.optimize(OptimizeAgent, n_trials=5, n_jobs=1)

In [None]:
# Get the best params to use in the actual training. This is super handy!
study.best_params

In [None]:
# Get the best trial we had in our testing
study.best_trial

In [10]:
l_rates = [5e-7, 5e-6, 5e-5, 5e-4, 5e-3]

# Train and evaluate one model per learning rate, everything else held fixed
for idx, rate in enumerate(l_rates):
    # Investigate learning rates
    params = {
        'n_steps': 4096,
        'gamma': .9,
        'learning_rate': rate,
        'clip_range': .25,
        'gae_lambda': .9
    }

    print("Testing l_rates[{}] : {}".format(idx, rate))

    # Create environment
    env = StreetFighter()
    try:
        env = Monitor(env, LOG_DIR)
        # Separate name so the factory below does not depend on `env` being rebound
        monitored_env = env
        env = DummyVecEnv([lambda: monitored_env])
        env = VecFrameStack(env, 4, channels_order='last')

        # Create the model
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **params)
        # Train. More timesteps means better results but a slower sweep;
        # each learning rate currently runs for 300000 timesteps.
        model.learn(total_timesteps=300000, tb_log_name='LR_TEST')

        # Evaluate the model. n_eval_episodes is the number of games the model plays. Should be like 20 to 30.
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)
        print("Mean reward for l_rates[{}] : {}".format(idx, mean_reward))

        # Save the model so we don't have to do this again
        # NOTE(review): same 'trial_N_best_model' pattern and directory that
        # OptimizeAgent saves to, so the two may overwrite each other's files.
        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(idx))
        model.save(SAVE_PATH)
    finally:
        # Release the environment even when training fails or is interrupted
        env.close()

KeyboardInterrupt: 

In [None]:
# We can load up our best trial and see what he can do with real training! This is why we are saving them.
model = PPO.load(os.path.join(OPT_DIR, 'trial_10_best_model.zip'))

## Set Up Our Training Callback

In [None]:
# Import base callback
from stable_baselines3.common.callbacks import BaseCallback

In [None]:
class TrainingAndLoggingCallback(BaseCallback):
    """Callback that saves the model every ``check_freq`` calls.

    Args:
        check_freq: A checkpoint is written whenever the call count is a
            multiple of this value.
        save_path: Directory the checkpoints go into. ``None`` turns
            checkpointing off.
        verbose: Passed through to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        # Make sure the checkpoint directory exists before the first save
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        # n_calls and model are presumably provided by BaseCallback - verify.
        # With no save_path there is nowhere to write, matching the None
        # handling in _init_callback instead of failing inside os.path.join.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)
        return True

In [None]:
# Directory the training checkpoints are written to
CHECKPOINT_DIR = './train/'

# Save a checkpoint every 100000 callback calls
callback = TrainingAndLoggingCallback(save_path=CHECKPOINT_DIR, check_freq=100000)

## Train the Model!
### Viewing the performance with tensorboard
To do so, just cd into the logs directory and run this: `tensorboard --logdir=.` . Tensorboard will run on a localhost that you can view in your browser.

In [None]:
# Build the training environment: the game wrapped for logging,
# then vectorized, then stacked 4 frames deep (channels last)
monitored_env = Monitor(StreetFighter(), LOG_DIR)
env = VecFrameStack(DummyVecEnv([lambda: monitored_env]), 4, channels_order='last')

In [None]:
# Fallback hyperparameters for when no study has been run (the best ones we found)
params = {
    'n_steps': 2570.949,
    'gamma': 0.906,
    'learning_rate': 2e-07,
    'clip_range': 0.369,
    'gae_lambda': 0.891,
}
# Snap n_steps to the nearest multiple of 64 and swap in the learning rate we want
params.update(
    n_steps=64 * int(np.round(params['n_steps'] / 64)),
    learning_rate=5e-7,
)

In [None]:
# Start from the best hyperparameters the study found
params = study.best_params
# Snap n_steps to the nearest multiple of the batch size (64)
params['n_steps'] = 64 * int(np.round(params['n_steps'] / 64))
# Optionally override with a lower learning rate
# params['learning_rate'] = 5e-7

In [None]:
# Create the model with the optimal params
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **params)

Run at most one of the load cells below (or none of them).

In [None]:
# Load our best performing model from hyperparam optimization for a head start.
# `load` builds and returns a new model instead of updating `model`, so calling it
# and dropping the result changes nothing; set_parameters copies the saved
# weights into the model we already created.
model.set_parameters(os.path.join(OPT_DIR, 'trial_{}_best_model.zip'.format(int(9))))

In [None]:
# Attach the training env while loading so the model can keep learning afterwards
model = PPO.load(os.path.join(OPT_DIR, 'akash', 'trial_{}_best_model.zip'.format(int(5))), env=env)

In [None]:
# Load the model with the best performance we saw.
# `load` returns a new model rather than updating `model`, so the saved weights
# are copied into the existing model with set_parameters instead.
model.set_parameters('./train/best_model_1500000.zip')

The actual training.

In [None]:
# Obviously, more timesteps means better performance. Like 5000000 would be a good number.
model.learn(total_timesteps=2000000, callback=callback, reset_num_timesteps=False)

## Evaluate the Model

In [None]:
# Load the model with the best performance we saw.
# `load` returns the loaded model, so it has to be assigned for it to be used.
model = PPO.load('./train/from_tut/best_model_5460000.zip')

In [None]:
# See how well the model works
mean_reward, reward_stdev = evaluate_policy(model, env, n_eval_episodes=5, render=True)

## Watch Our Boy Work!

In [None]:
# Build the environment for watching the agent: the game wrapped for logging,
# then vectorized, then stacked 4 frames deep (channels last)
monitored_env = Monitor(StreetFighter(), LOG_DIR)
env = VecFrameStack(DummyVecEnv([lambda: monitored_env]), 4, channels_order='last')

In [11]:
# Have to close the env before starting a new one.
env.close()

In [None]:
# The game loop
num_games = 5
# Play num_games games back to back
for game_idx in range(num_games):
    # Reset the env to start a fresh game
    obs = env.reset()
    # Becomes truthy once the game is over (we've died or beaten it)
    done = False
    while not done:
        env.render()
        # predict returns a pair; only the action part is needed here
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # Optional throttle to slow playback down:
        # time.sleep(1/90)
        print(reward)