# Train an agent to play Street Fighter with the A2C algorithm
## Train and Evaluate
Train the A2C model and evaluate its performance.
## Set Hyperparameters
Set the hyperparameters obtained from the best trial in the PPO experiment, adjusted for A2C.

In [1]:
import os
import time

import numpy as np
import cv2
import retro
from gym import Env
from gym.spaces import Box, MultiBinary
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

from street_fighter_env import StreetFighter


# Hyperparameters
def get_hyperparameters():
    """Return the fixed hyperparameter set used to configure the A2C model.

    Returns:
        dict: a fresh dictionary with the keys 'n_steps', 'gamma' and
        'learning_rate', in that order.
    """
    params = {}
    params['n_steps'] = 2819
    params['gamma'] = 0.8591916176596557
    params['learning_rate'] = 5e-7
    return params

# Environment Setup
def create_environment():
    """Build the wrapped Street Fighter environment used for training.

    A ``StreetFighter`` instance is wrapped in a ``Monitor`` (given the path
    "./logs/"), placed in a single-environment ``DummyVecEnv``, and finally
    wrapped in a ``VecFrameStack`` of 4 frames with ``channels_order='last'``.

    Returns:
        The ``VecFrameStack``-wrapped environment.
    """
    monitored_env = Monitor(StreetFighter(), "./logs/")
    vec_env = DummyVecEnv([lambda: monitored_env])
    return VecFrameStack(vec_env, 4, channels_order='last')

# Model Training
def train_agent(hyperparameters, total_timesteps=1000000, n_eval_episodes=5):
    """Train an A2C agent, save it, and return its mean evaluation reward.

    Args:
        hyperparameters: mapping providing 'n_steps', 'gamma' and
            'learning_rate' for the A2C constructor.
        total_timesteps: number of environment steps passed to
            ``model.learn`` (defaults to the original 1,000,000).
        n_eval_episodes: number of episodes passed to ``evaluate_policy``
            (defaults to the original 5).

    Returns:
        The mean reward reported by ``evaluate_policy``, or the sentinel
        ``-1000`` if any exception was raised while creating the environment,
        training, saving or evaluating. The exception is printed, not re-raised.
    """
    # Track the environment outside the try block so it can be released on
    # every exit path, including failures part-way through training.
    env = None
    try:
        env = create_environment()

        # Create algorithm
        model = A2C(
            'CnnPolicy',
            env,
            tensorboard_log="./logs/tensorboard-A2C/",
            verbose=1,
            n_steps=hyperparameters['n_steps'],
            gamma=hyperparameters['gamma'],
            learning_rate=hyperparameters['learning_rate']
        )

        # Train the model
        model.learn(total_timesteps=total_timesteps)

        # Save the model (create the target directory first if it is missing)
        save_path = os.path.join("./models/", 'a2c_streetfighter_model')
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        model.save(save_path)
        print(f"Model saved to {save_path}")

        # Evaluate model; the second return value (reward std) is not used
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)

        return mean_reward

    except Exception as e:
        # Deliberate best-effort: report the failure and hand back a sentinel
        # instead of raising, as callers expect a numeric result.
        print(f"Error during training: {e}")
        return -1000

    finally:
        # Always release the environment, even when training or evaluation
        # failed. A failure while closing is reported but does not mask the
        # result computed above.
        if env is not None:
            try:
                env.close()
            except Exception as close_error:
                print(f"Error closing environment: {close_error}")

# Main Execution
# Runs only when this code executes as the top-level module.
if __name__ == "__main__":
    # Fetch the fixed hyperparameter dictionary for the A2C model.
    hyperparameters = get_hyperparameters()
    # Train, save and evaluate the agent; train_agent returns the mean
    # evaluation reward, or -1000 if it caught an exception along the way.
    mean_reward = train_agent(hyperparameters)
    print(f"Mean reward: {mean_reward}")


  from .autonotebook import tqdm as notebook_tqdm


ROM path: c:\Users\marya\Documents\Courses\RL-project-streetFighter\.venv\lib\site-packages\retro/data\stable\StreetFighterIISpecialChampionEdition-Genesis\rom.md
Using cpu device
Wrapping the env in a VecTransposeImage.
Logging to ./logs/tensorboard-A2C/A2C_2
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 1.02e+04  |
|    ep_rew_mean        | 2.94e+04  |
| time/                 |           |
|    fps                | 158       |
|    iterations         | 100       |
|    time_elapsed       | 1782      |
|    total_timesteps    | 281900    |
| train/                |           |
|    entropy_loss       | -8.32     |
|    explained_variance | -3.24e-05 |
|    learning_rate      | 5e-07     |
|    n_updates          | 99        |
|    policy_loss        | 207       |
|    value_loss         | 4.55e+04  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean   

## Evaluate the model
Load the pre-trained A2C model and let it play the game.

In [None]:
from gym import Env
from gym.spaces import Discrete, Box, MultiBinary
import numpy as np
import cv2
# The checkpoint at this path was trained and saved with A2C, so it must be
# loaded with the same algorithm class (PPO is also never imported here).
model = A2C.load('./models/a2c_streetfighter_model')

In [None]:
# Release an environment left over from an earlier run of this cell, if any.
# On a fresh kernel no global `env` exists yet, so an unconditional
# env.close() would raise NameError.
if 'env' in globals():
    env.close()

# Wrap the game: Monitor -> single-env DummyVecEnv -> stack of 4 frames along
# the last axis. The base env gets its own name so the lambda does not capture
# a variable that is rebound on the same line.
base_env = Monitor(StreetFighter())
env = DummyVecEnv([lambda: base_env])
env = VecFrameStack(env, 4, channels_order='last')

In [None]:
# Let the loaded model play three episodes, rendering each step.
for episode in range(3):
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        action, _ = model.predict(obs)
        # `env` is a vectorised environment holding a single game, so step()
        # returns length-1 arrays; unpack them to plain scalars so the loop
        # condition and the printed total are ordinary Python values.
        obs, rewards, dones, info = env.step(action)
        env.render()
        total_reward += float(rewards[0])
        done = bool(dones[0])
    print('Total Reward is {} for episode {}'.format(total_reward, episode))
    # Short pause between episodes (`time` comes from the notebook's import cell).
    time.sleep(2)