## VizDoom


In [1]:
!pip install vizdoom



In [2]:
# Import vizdoom for the game env
from vizdoom import *
# Import random
import random
# Import time so we can sleep between actions
import time
    

In [3]:
# Setting up the game

# Create the Doom game object, point it at the "basic" scenario config,
# then start the engine. load_config() is called before init() here.
game = DoomGame()
game.load_config('scenarios/basic.cfg')
game.init()

In [4]:
import numpy as np

In [5]:
actions = np.identity(3, dtype = np.uint8)

In [6]:
# These are all of the possible actions that you can take in the environment
# (each row of the 3x3 identity matrix is one action)
actions

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]], dtype=uint8)

In [7]:
random.choice(actions)

array([0, 1, 0], dtype=uint8)

In [8]:
game.new_episode()


In [9]:
game.is_episode_finished()

False

In [10]:
game.make_action(random.choice(actions))

-1.0

In [11]:
# Sanity-check the environment by playing with completely random actions
# (nothing is learned here).

# Play 10 games in a row
episodes = 10
for num_of_episode in range(episodes):
    game.new_episode()
    # Keep stepping until the current game is over
    while True:
        if game.is_episode_finished():
            break
        state = game.get_state()
        # Screen image for the current frame
        img = state.screen_buffer
        # Game variables (this corresponds to our ammo)
        info = state.game_variables
        # Pick a random action; 5 is the second argument to make_action
        reward = game.make_action(random.choice(actions), 5)
        print("reward is : ", reward)
        # Short pause so the actions are slow enough to watch
        time.sleep(0.02)
    print("Result:", game.get_total_reward())
    time.sleep(1)

reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  96.0
Result: 71.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -5.0
reward is :  -10.0
reward is :  -5.0
reward is :  -5.0
reward is :  -10.0
r

In [12]:
game.close()

In [13]:
# Now we need to wrap the game in an OpenAI Gym environment, so let's set that up

## Setting up Open AI Gym and Converting

In [14]:
!pip install gym



In [15]:
# Import environment base class from Open AI
from gym import Env

from gym.spaces import Discrete,Box

# Lets us grayscale our game frames so that training can compute faster
import cv2

In [16]:
Discrete(3).sample()

2

In [17]:
actions[Discrete(3).sample()]

array([0, 1, 0], dtype=uint8)

In [18]:
Box(low = 0, high = 10, shape = (10,10), dtype = np.uint8).sample()

array([[ 2,  3,  2,  6,  9,  0,  4,  7,  9,  1],
       [ 4,  8,  6,  9, 10,  9,  9,  1,  3,  7],
       [ 1,  0, 10,  4,  5,  9,  1,  6,  6,  8],
       [ 8,  3,  4,  2,  1,  4,  9,  1,  7,  7],
       [ 3,  7,  9, 10, 10,  6,  7,  5,  6,  0],
       [ 7,  1,  8,  4,  9,  3,  0,  5,  5,  4],
       [ 6, 10,  0,  9,  6,  0,  9,  9,  5,  1],
       [ 9,  2,  0,  1,  6,  1,  1,  6,  5,  9],
       [ 9,  5,  9,  5,  0,  4,  0,  1,  7,  7],
       [ 2,  9,  4,  7, 10,  7,  9,  2,  8,  8]], dtype=uint8)

In [19]:
# Create Vizdoom OpenAI Gym Environment
class VizDoomGym(Env):
    """OpenAI Gym wrapper around the VizDoom 'basic' scenario.

    Observations are grayscale frames of shape (100, 160, 1) and dtype uint8.
    The action space is ``Discrete(3)``; each action index selects one row of
    a 3x3 identity matrix, which is passed to ``DoomGame.make_action``.
    """

    def __init__(self, render=False):
        """Load the scenario config and start the game.

        Args:
            render: If truthy, the game window is shown; otherwise it is
                hidden.
        """
        # Inherit from Env
        super().__init__()
        # Setup the game
        self.game = DoomGame()
        self.game.load_config('scenarios/basic.cfg')

        # Show or hide the game window
        self.game.set_window_visible(bool(render))

        # Start the game
        self.game.init()

        # Create the action space and observation space
        self.observation_space = Box(low=0, high=255, shape=(100, 160, 1), dtype=np.uint8)
        self.action_space = Discrete(3)

        # One-hot action vectors, built once instead of on every step
        self._actions = np.identity(3, dtype=np.uint8)

    def step(self, action):
        """Apply ``action`` and return the resulting transition.

        Args:
            action: Integer index in ``[0, 3)`` selecting the one-hot action
                vector to send to the game.

        Returns:
            A ``(state, reward, done, info)`` tuple. ``state`` is the
            grayscale frame, or an all-zero uint8 frame when the game no
            longer has a state. ``info`` is ``{"info": ammo}``, where ammo
            is the first game variable (0 when there is no state).
        """
        # 4 is the second argument to make_action
        reward = self.game.make_action(self._actions[action], 4)

        # Query the state once; it is None when the game has no current state
        game_state = self.game.get_state()
        if game_state is not None:
            state = self.grayscale(game_state.screen_buffer)
            ammo = game_state.game_variables[0]
        else:
            # Placeholder frame that still matches the declared observation
            # space (shape AND dtype)
            state = np.zeros(self.observation_space.shape,
                             dtype=self.observation_space.dtype)
            ammo = 0

        info = {"info": ammo}
        done = self.game.is_episode_finished()

        return state, reward, done, info

    def render(self, mode="human"):
        """No-op; window visibility is set once in ``__init__``.

        ``mode`` is accepted and ignored so that callers passing a render
        mode do not fail.
        """
        pass

    def reset(self):
        """Start a new episode and return its first grayscale frame."""
        self.game.new_episode()
        state = self.game.get_state().screen_buffer
        return self.grayscale(state)

    def grayscale(self, observation):
        """Convert a game frame to a (100, 160, 1) grayscale array.

        Args:
            observation: Frame whose first axis is moved to the end before
                colour conversion (presumably channels-first; confirm
                against the scenario's screen format).

        Returns:
            The resized grayscale frame reshaped to (100, 160, 1).
        """
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        # cv2.resize takes (width, height), so this yields 100 rows x 160 cols
        resize = cv2.resize(gray, (160, 100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100, 160, 1))
        return state

    def close(self):
        """Shut down the underlying game."""
        self.game.close()

In [20]:
env = VizDoomGym(render = True)

In [21]:
state = env.reset()

In [22]:
env.reset()

array([[[55],
        [50],
        [59],
        ...,
        [57],
        [57],
        [66]],

       [[68],
        [65],
        [65],
        ...,
        [56],
        [67],
        [72]],

       [[49],
        [79],
        [66],
        ...,
        [79],
        [51],
        [29]],

       ...,

       [[75],
        [63],
        [62],
        ...,
        [44],
        [71],
        [60]],

       [[15],
        [48],
        [47],
        ...,
        [49],
        [69],
        [47]],

       [[22],
        [14],
        [26],
        ...,
        [57],
        [37],
        [39]]], dtype=uint8)

In [23]:
env.close()

In [24]:
from stable_baselines3.common import env_checker


In [25]:
#env_checker.check_env(env)

In [26]:

#from matplotlib import pyplot as plt
#plt.imshow(cv2.cvtColor(state,cv2.COLOR_BGR2RGB))

#  Setting up the Callback

In [27]:
!pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio===0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html


Looking in links: https://download.pytorch.org/whl/cu113/torch_stable.html


In [28]:
!pip install stable-baselines3[extra]




In [29]:
import os

from stable_baselines3.common.callbacks import BaseCallback

In [30]:
class TrainAndLoggingCallback(BaseCallback):
    """Save a checkpoint of the model every ``check_freq`` callback calls.

    Checkpoints are written to ``save_path`` as ``best_model_<n_calls>``.
    Despite the file name, every periodic checkpoint is saved; no "best"
    model is selected.

    Args:
        check_freq: Save a checkpoint whenever ``n_calls`` is a multiple of
            this value.
        save_path: Directory for checkpoints, created if missing. ``None``
            disables saving.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint when due; always return True."""
        # Same None guard as _init_callback, so save_path=None no longer
        # raises inside os.path.join
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [31]:
CHECKPOINT_DIR = './train/train_basic'
LOG_DIR = './logs/log_basic'

In [32]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)


# Let's Train this Model now 

In [33]:
from stable_baselines3 import PPO

In [34]:
# Create an environment

env = VizDoomGym()

In [35]:
# Create the model. We use CnnPolicy because we are passing in images,
# so we need a convolutional neural network.
model = PPO("CnnPolicy", env, tensorboard_log = LOG_DIR, verbose = 1, learning_rate = 0.0001, n_steps = 260 )



  return torch._C._cuda_getDeviceCount() > 0
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=260 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [36]:
model.learn(total_timesteps = 100000,callback = callback)

Logging to ./logs/log_basic\PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.1     |
|    ep_rew_mean     | -10.2    |
| time/              |          |
|    fps             | 11       |
|    iterations      | 1        |
|    time_elapsed    | 21       |
|    total_timesteps | 260      |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 23.9         |
|    ep_rew_mean          | -29.4        |
| time/                   |              |
|    fps                  | 13           |
|    iterations           | 2            |
|    time_elapsed         | 38           |
|    total_timesteps      | 520          |
| train/                  |              |
|    approx_kl            | 0.0038518147 |
|    clip_fraction        | 0.0228       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.09        |
|    explained_variance 

KeyboardInterrupt: 

In [37]:
# ep_len_mean = how many steps an episode lasts on average
# ep_rew_mean = average total reward per episode
# approx_kl = approximate KL divergence between the current and previous policy (how much the policy changed in the update)
# policy_gradient_loss = how well our agent can capitalize on an advantage
# value_loss = how well it is able to predict the value of each state

In [38]:
# This does not seem to be training well; in fact, the reward is not going up

In [39]:
model = PPO("CnnPolicy", env, tensorboard_log = LOG_DIR, verbose = 1, learning_rate = 0.0001, n_steps = 2048 )


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [40]:
model.learn(total_timesteps = 100000,callback = callback)

Logging to ./logs/log_basic\PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.8     |
|    ep_rew_mean     | -75      |
| time/              |          |
|    fps             | 32       |
|    iterations      | 1        |
|    time_elapsed    | 62       |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 31.5       |
|    ep_rew_mean          | -77.1      |
| time/                   |            |
|    fps                  | 21         |
|    iterations           | 2          |
|    time_elapsed         | 189        |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00792253 |
|    clip_fraction        | 0.15       |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.09      |
|    explained_variance   | -0.000161  |
|    learni

KeyboardInterrupt: 

In [None]:
# Testing on a new model with better n_steps so hopefully it will learn better than the previous model
# Seems to have worked much better with a higher n_steps
# This is taking too long to keep running and the reward is already quite good after only half of the timesteps


## Test the model

In [56]:
# Reload our best model
model = PPO.load('./train/train_basic/best_model_60000')

In [57]:
from stable_baselines3.common.evaluation import evaluate_policy

In [58]:
env = VizDoomGym(render = True)

In [59]:
mean_reward, _= evaluate_policy(model,env,n_eval_episodes = 100)

In [60]:
mean_reward

87.61

In [49]:
# Test the first model

In [50]:
model = PPO.load('./train/train_basic/best_model_10000')

In [52]:
mean_reward, _= evaluate_policy(model,env,n_eval_episodes = 50)

In [53]:
mean_reward

-87.38