# 1. Setup ViZDoom

In [1]:
!pip install vizdoom

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!cd github_clone & git clone https://github.com/Farama-Foundation/ViZDoom

fatal: destination path 'ViZDoom' already exists and is not an empty directory.


In [3]:
# ViZDoom for game env
from vizdoom import *
# Random for action sampling
import random
# Time for sleeping
import time
# Identity matrix
import numpy as np

In [4]:
# Setup game: create the ViZDoom instance, apply the deadly-corridor scenario config,
# then start the engine with init(). The `game` object is reused by the cells below.
game = DoomGame()
game.load_config('github_clone/ViZDoom/scenarios/deadly_corridor.cfg')
game.init()

In [5]:
# Define the set of actions within this Doom env.
# Each row of the 7x7 identity matrix is a one-hot vector, i.e. exactly one entry set per action.
# NOTE(review): 7 presumably matches the number of available buttons in the scenario .cfg — confirm.
actions = np.identity(7, dtype=np.uint8)

In [6]:
episodes = 10

for episode in range(episodes):
    # Start a fresh game for this episode
    game.new_episode()

    # Keep acting until ViZDoom reports that the episode is over
    while True:
        if game.is_episode_finished():
            break

        state = game.get_state()
        # Raw frame and the game variables exposed by the scenario (e.g. ammo)
        img = state.screen_buffer
        info = state.game_variables

        # Sample a random one-hot action and pass 4 as the second argument of make_action,
        # so the action plays out over several frames before the next observation
        chosen_action = random.choice(actions)
        reward = game.make_action(chosen_action, 4)
        print('reward: ', reward)
        time.sleep(0.02)

    # Episode summary, then a short pause before the next one
    print('Result: ', game.get_total_reward())
    time.sleep(2)

reward:  0.0
reward:  0.0
reward:  -7.1104888916015625
reward:  -1.4199066162109375
reward:  -7.2936553955078125
reward:  0.0
reward:  0.0
reward:  0.0
reward:  0.0
reward:  0.0
reward:  6.46514892578125
reward:  10.59967041015625
reward:  3.42364501953125
reward:  -2.61199951171875
reward:  3.2097320556640625
reward:  -2.2396087646484375
reward:  -11.708999633789062
reward:  -7.2717742919921875
reward:  2.8099212646484375
reward:  4.364898681640625
reward:  -2.2844696044921875
reward:  -4.7718658447265625
reward:  -0.13128662109375
reward:  3.4748687744140625
reward:  10.190567016601562
reward:  13.634933471679688
reward:  11.326904296875
reward:  12.817367553710938
reward:  7.3311614990234375
reward:  4.8975372314453125
reward:  4.57183837890625
reward:  9.548873901367188
reward:  9.835830688476562
reward:  10.87945556640625
reward:  -0.0059661865234375
reward:  -3.183074951171875
reward:  -2.1471710205078125
reward:  -1.448394775390625
reward:  -0.9770660400390625
reward:  6.4511718

ViZDoomUnexpectedExitException: Controlled ViZDoom instance exited unexpectedly.

In [None]:
# Shut down the ViZDoom instance created in the setup cell above.
game.close()

# 2. Convert to Gym Environment

In [7]:
!pip install gym

Defaulting to user installation because normal site-packages is not writeable


In [8]:
# Env base class
from gym import Env
# Gym spaces
# Discrete: Represents our actions
# Box: An array of any shape
from gym.spaces import Discrete, Box
# OpenCV
import cv2

In [9]:
# ViZDoom env
class VizDoomGym(Env):
    """Gym wrapper around a ViZDoom scenario with a shaped reward.

    Observations are 100x160 single-channel uint8 frames. Actions are integer
    indices into a 7x7 one-hot table. ``step`` returns the 4-tuple
    ``(observation, reward, done, info)``.

    The reward shaping unpacks exactly four game variables, in the order
    HEALTH, DAMAGE_TAKEN, HITCOUNT, SELECTED_WEAPON_AMMO. The scenario config is
    assumed to expose them in that order — confirm against the .cfg in use.
    """

    # Start the env
    def __init__(self, render=False, config='github_clone/ViZDoom/scenarios/deadly_corridor.cfg'):
        """Create and start the game.

        Args:
            render: Truthy to show the ViZDoom window; falsy to run without a
                window (lets us train without opening a window each time).
            config: Path to the ViZDoom scenario ``.cfg`` file.
        """
        # Inherit from our Env class
        super().__init__()
        # Setup game
        self.game = DoomGame()
        self.game.load_config(config)

        # Window visibility has to be chosen before the game is initialised
        self.game.set_window_visible(bool(render))
        self.game.init()

        # Create spaces
        # game.get_state().screen_buffer.shape
        self.observation_space = Box(low=0, high=255, shape=(100,160,1), dtype=np.uint8)
        self.action_space = Discrete(7)

        # One-hot action table, built once instead of on every step
        self._actions = np.identity(7)

        # Game variables: HEALTH DAMAGE_TAKEN HITCOUNT SELECTED_WEAPON_AMMO
        self._reset_trackers()

    def _reset_trackers(self):
        """Restore the per-episode baselines used to compute reward deltas."""
        self.damage_taken = 0
        self.hitcount = 0
        # Starting ammo baseline — presumably the scenario's initial ammo; TODO confirm
        self.ammo = 52

    # Represents a step in the env
    def step(self, action):
        """Apply ``action`` for 4 tics and return ``(state, reward, done, info)``.

        Args:
            action: Index of the one-hot action to perform.

        Returns:
            state: Preprocessed frame, or an all-zero uint8 frame when the game
                has no state any more (episode finished).
            reward: The game's own reward plus the shaping terms; on the
                terminal step only the game's own reward.
            done: Whether the episode is finished.
            info: ``{"info": ammo}`` (``0`` when there is no state).
        """
        # Specify action and take step
        movement_reward = self.game.make_action(self._actions[action], 4)
        # Start from the reward returned by the game so it is not thrown away
        # when the episode ends and there is no state left to shape against
        reward = movement_reward

        # Fetch the state once; it is falsy/None when the episode is over
        game_state = self.game.get_state()
        if game_state is not None:
            state = self.grayscale(game_state.screen_buffer)

            # Reward shaping
            health, damage_taken, hitcount, ammo = game_state.game_variables

            # Calculate the reward changes since the previous step
            damage_taken_delta = -damage_taken + self.damage_taken
            self.damage_taken = damage_taken
            hitcount_delta = hitcount - self.hitcount
            self.hitcount = hitcount
            ammo_delta = ammo - self.ammo
            self.ammo = ammo

            # Weighted sum with different weight biase(s)
            reward = movement_reward + damage_taken_delta*10 + hitcount_delta*200 + ammo_delta*5
            info = ammo
        else:
            # Placeholder frame must match the declared observation dtype (uint8)
            state = np.zeros(self.observation_space.shape, dtype=self.observation_space.dtype)
            info = 0

        info = {"info":info}
        done = self.game.is_episode_finished()

        return state, reward, done, info

    # Render the env
    def render(self, mode='human'):
        """No-op; window visibility is decided by ``render`` in ``__init__``."""
        pass

    # New game
    def reset(self):
        """Start a new episode and return its first preprocessed frame."""
        self.game.new_episode()
        # Without this, the first step of the new episode would compute deltas
        # against the final values of the previous episode
        self._reset_trackers()
        state = self.game.get_state().screen_buffer
        return self.grayscale(state)

    # Grayscale the game frame and resize it
    # Reshaping the array into the form that cvtColor expects it in (channel should be at the end)
    def grayscale(self, observation):
        """Convert a frame to a (100, 160, 1) grayscale array.

        Args:
            observation: Colour frame with the channel axis first (axis 0 is
                moved to the end before the colour conversion).
        """
        gray = cv2.cvtColor(np.moveaxis(observation,0,-1), cv2.COLOR_BGR2GRAY)
        # Scales it down so less pixels to process
        # cv2.resize takes (width, height), hence (160, 100) for a 100x160 array
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100,160,1))

        return state

    # Close the env
    def close(self):
        """Shut down the underlying ViZDoom instance."""
        self.game.close()

In [None]:
# Instantiate the wrapper with the game window visible (default deadly-corridor config).
env = VizDoomGym(render=True)

In [None]:
# Shut down the ViZDoom instance behind `env`; the env cannot be stepped after this.
env.close()

In [10]:
# Env checker
from stable_baselines3.common import env_checker

In [None]:
# Validate the wrapper against the Gym API on a dedicated, windowless instance.
# In cell order the shared `env` has already been closed by the previous cell, so checking
# it would run against a dead game; the instance is closed again whatever the outcome.
checker_env = VizDoomGym()
try:
    env_checker.check_env(checker_env)
finally:
    checker_env.close()

In [None]:
!pip install matplotlib
from matplotlib import pyplot as plt
#plt.imshow(state)

# 3. Setup Callback

In [11]:
!pip install torch torchvision torchaudio
# PPO for RL
!pip install stable-baselines3[extra]

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [12]:
# OS for file
import os
# Callback
from stable_baselines3.common.callbacks import BaseCallback

In [13]:
# Save our model every x number of steps
class TrainAndLoggingCallback(BaseCallback):
    """Checkpoint the model every ``check_freq`` callback calls.

    Checkpoints are written to ``save_path`` as ``best_model_<n_calls>``.
    Passing ``save_path=None`` disables saving.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        """
        Args:
            check_freq: Save a checkpoint whenever ``n_calls`` is a multiple of this.
            save_path: Directory for checkpoints, or ``None`` to skip saving.
            verbose: Verbosity level forwarded to ``BaseCallback``.
        """
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if one was configured."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th call; always continue training."""
        # Same None guard as _init_callback: without it os.path.join would raise on None
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        # Returning True tells the training loop to keep going
        return True

In [14]:
# Directory handed to the checkpoint callback as its save_path
CHECKPOINT_DIR = './train/train_corridor'
# Directory handed to PPO as tensorboard_log
LOG_DIR = './logs/log_corridor'

In [15]:
# Checkpoint callback: saves the model into CHECKPOINT_DIR every 200000 callback calls.
callback = TrainAndLoggingCallback(check_freq=200000, save_path=CHECKPOINT_DIR)

# 4. Train the Model Using Curriculum Learning

In [16]:
# PPO
from stable_baselines3 import PPO

In [None]:
# No render (render defaults to False, so the game window stays hidden).
# Uses the '_s1' variant of the deadly-corridor config — presumably the first curriculum stage.
env = VizDoomGym(config='github_clone/ViZDoom/scenarios/deadly_corridor_s1.cfg')

In [None]:
# n_steps: the original note says to check 'scenarios/basic.cfg' in the ViZDoom GitHub repository
# and to set the step count close to the maximum number of actions in a single game.
# NOTE(review): this notebook trains on the deadly-corridor configs, not basic.cfg — confirm which limit was meant.
# With n_steps=8192 the agent stores 8192 sets of observations, actions, log probs and values
# in the buffer for one iteration; 8192 is a multiple of 64 (presumably PPO's default batch size — confirm).
# NOTE(review): the original note ties the large n_steps to the small clip_range (0.10); the reasoning is not stated.
# learning_rate is set to 0.00001.
# Gamma: Discount factor
# Gae lambda: Smoothing parameter used when calculating the advantage
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR, learning_rate=0.00001, n_steps=8192, clip_range=0.10, gamma=0.95, gae_lambda=0.90)

In [None]:
# Train on the current env for 400000 timesteps; `callback` writes the periodic checkpoints.
model.learn(total_timesteps=400000, callback=callback)

In [None]:
# `load` is a classmethod that builds and returns a NEW model; it does not update the
# instance it is called on. The result must be assigned, otherwise the checkpoint is
# silently ignored. The current env is attached so training can continue right away.
model = PPO.load('./train/train_corridor/best_model_260000', env=env)

In [None]:
# Curriculum stage 2.
# Close the previous stage's env first: each VizDoomGym owns a ViZDoom instance, and
# rebinding `env` without closing it would leave that game running.
env.close()
env = VizDoomGym(config='github_clone/ViZDoom/scenarios/deadly_corridor_s2.cfg')
# Point the existing model at the new env and keep training it
model.set_env(env)
model.learn(total_timesteps=40000, callback=callback)

In [None]:
# Curriculum stage 3.
# Close the previous stage's env first: each VizDoomGym owns a ViZDoom instance, and
# rebinding `env` without closing it would leave that game running.
env.close()
env = VizDoomGym(config='github_clone/ViZDoom/scenarios/deadly_corridor_s3.cfg')
# Point the existing model at the new env and keep training it
model.set_env(env)
model.learn(total_timesteps=40000, callback=callback)

In [None]:
# Curriculum stage 4.
# Close the previous stage's env first: each VizDoomGym owns a ViZDoom instance, and
# rebinding `env` without closing it would leave that game running.
env.close()
env = VizDoomGym(config='github_clone/ViZDoom/scenarios/deadly_corridor_s4.cfg')
# Point the existing model at the new env and keep training it
model.set_env(env)
model.learn(total_timesteps=40000, callback=callback)

In [None]:
# Curriculum stage 5.
# Close the previous stage's env first: each VizDoomGym owns a ViZDoom instance, and
# rebinding `env` without closing it would leave that game running.
env.close()
env = VizDoomGym(config='github_clone/ViZDoom/scenarios/deadly_corridor_s5.cfg')
# Point the existing model at the new env and keep training it
model.set_env(env)
model.learn(total_timesteps=40000, callback=callback)

# 5. Test the Model

In [17]:
# Evaluation policy to test agent
from stable_baselines3.common.evaluation import evaluate_policy

In [18]:
# Load the saved checkpoint as a new PPO model for evaluation.
# NOTE(review): the recorded output of this cell reports missing state_dict keys
# (pi_/vf_features_extractor.*) — possibly a library-version mismatch between saving and loading; confirm.
model = PPO.load('./train/train_corridor/best_model_560000')

Exception: 'bytes' object cannot be interpreted as an integer
Exception: 'bytes' object cannot be interpreted as an integer
	Missing key(s) in state_dict: "pi_features_extractor.cnn.0.weight", "pi_features_extractor.cnn.0.bias", "pi_features_extractor.cnn.2.weight", "pi_features_extractor.cnn.2.bias", "pi_features_extractor.cnn.4.weight", "pi_features_extractor.cnn.4.bias", "pi_features_extractor.linear.0.weight", "pi_features_extractor.linear.0.bias", "vf_features_extractor.cnn.0.weight", "vf_features_extractor.cnn.0.bias", "vf_features_extractor.cnn.2.weight", "vf_features_extractor.cnn.2.bias", "vf_features_extractor.cnn.4.weight", "vf_features_extractor.cnn.4.bias", "vf_features_extractor.linear.0.weight", "vf_features_extractor.linear.0.bias".  


In [21]:
# Evaluation env with the game window visible, using the '_s1' deadly-corridor config.
env = VizDoomGym(render=True, config='github_clone/ViZDoom/scenarios/deadly_corridor_s1.cfg')

In [20]:
# Run the loaded model for 10 evaluation episodes; keep the first returned value
# (the mean reward) and discard the second.
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)



ViZDoomUnexpectedExitException: Controlled ViZDoom instance exited unexpectedly.

In [22]:
for episode in range(20):
    # Fresh episode: first observation, not finished yet, nothing accumulated
    observation, done, total_reward = env.reset(), False, 0

    # Let the model act until the env reports the episode is over
    while not done:
        action, _ = model.predict(observation)
        step_result = env.step(action)
        observation, reward, done, info = step_result
        total_reward += reward

    # Report the accumulated shaped reward, then pause before the next episode
    print('Total Reward for episode {} is {}'.format(episode, total_reward))
    time.sleep(2)

Total Reward for episode 0 is 2586.8333587646484
Total Reward for episode 1 is -495.2959442138672
Total Reward for episode 2 is 637.2235870361328
Total Reward for episode 3 is 2015.914077758789


ViZDoomUnexpectedExitException: Controlled ViZDoom instance exited unexpectedly.