Author: Christos Christidis

# Imports

Use: tensorboard --logdir=runs to run the Tensorboard session and view the results if the images below don't work.

In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from collections import deque
import random
import time

In [2]:
gym.__version__

'0.26.2'

# Settings

In [3]:
parameters = {
"num_episodes" :  1000, #1,  # Change to 1000, 2000
"batch_size" : 32,
"gamma" : 0.9, #0.99, # Discount factor
"epsilon_start" : 0.4, # 1.0, # Try smaller epsilon start for less exploration
"epsilon_end" : 0.01,
"epsilon_decay" : 0.995, # Not used
"target_update" : 10,
"device" :  torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
"learning_rate" : 0.003 #0.005
}

In [4]:
parameters['device']

device(type='cuda')

# Model

In [5]:
#%pip install "gym[atari, accept-rom-license]"

In [6]:
import gym

In [7]:
from ale_py import ALEInterface
ale = ALEInterface()

In [8]:
class CNNPolicy(nn.Module):
    def __init__(self, input_channels, output_dim):
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(input_channels, 32, kernel_size=3, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=1, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=1, stride=1)
        self.fc1 = nn.Linear(1280, 512) #(3136, 512) # Based on RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x1280 and 3136x512), we change from 3136 to 1280
        self.fc2 = nn.Linear(512, output_dim)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Environment and model initialization

In [9]:
# Initialize environment
env = gym.make('Assault-v4') # render_mode='human' <-- Crashes!!! # Try with a different game too, it's plug and play! v4 has no repeated action enabled
#env = gym.make('Centipede-v4') 
input_channels = env.observation_space.shape[0] # Input for the cnns should be states
output_dim = env.action_space.n # Output of the cnns should be an action

In [10]:
input_channels

210

In [11]:
# 9 available actions
output_dim

7

In [12]:
#Observation space: (lower bound, upper bound, shape, dtype)
env.observation_space#.shape

Box(0, 255, (210, 160, 3), uint8)

In [13]:
# Initialize policy and target networks
policy_net = CNNPolicy(input_channels, output_dim).to(parameters["device"])
target_net = CNNPolicy(input_channels, output_dim).to(parameters["device"])
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

CNNPolicy(
  (conv1): Conv2d(210, 32, kernel_size=(3, 3), stride=(4, 4))
  (conv2): Conv2d(32, 64, kernel_size=(1, 1), stride=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
  (fc1): Linear(in_features=1280, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=7, bias=True)
)

In [14]:
optimizer = optim.Adam(policy_net.parameters(), lr=parameters['learning_rate'])
replay_buffer = deque(maxlen=32)
epsilon = parameters['epsilon_start']
epsilon_decay_step = (parameters['epsilon_start'] - parameters['epsilon_end']) / parameters['num_episodes']

# Training loop

According to https://openreview.net/pdf/75f0008512b0ab359f0fdbba5551d26760b7bce8.pdf, during the training phase should be used at most 200M frames and end the episode when all lives are lost or the episode exceeds 30mins. Also it is suggested that the agents are trained on at least 10m steps. Due to resource limitations such great lengths of episodes were not met.

tensorboard --logdir=runs

* TODO: Assault - Let train for 5m steps ~10k episodes <- leave overnight
* TODO: Run the following one more time with the initial params
* TODO: Try to train with Stable Baseline (e.g. PPO ) and compare results

In [164]:
writer = SummaryWriter()
steps = 0

for episode in range(parameters['num_episodes']):
    state = env.reset(seed=42)[0] # env.reset() returns observation state + info so we only keep the state
    #print(state)
    total_reward = 0
    episode_length = 0
    policy_loss_sum = 0
    value_loss_sum = 0
    done = False
    fault = False

    while not done:
        episode_length += 1
        #try:
        # Epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample() # Select a random action to do
            #print("Selected random")
        else:
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0).to(parameters['device'])
                q_values = policy_net(state_tensor)
                action = q_values.argmax().item()  # Select the best action
                #print("Selected best action")
        '''except ValueError as e:
            print("Found ValueError: expected sequence of length 210 at dim 1 (got 3), skipping to the next episode!")
            fault = True
            break'''
        

        next_state, reward, done, _, _ = env.step(action)  # Returns 5: sample of obs_space (next state), reward, terminated we need and truncated, info which we don't need
        total_reward += reward
        replay_buffer.append((state, action, reward, next_state, done))
        state = next_state
        
        # Perform gradient descent step
        if len(replay_buffer) > parameters['batch_size']:
            print("replay_buffer > 32")
            batch = random.sample(replay_buffer, parameters['batch_size'])
        else:
            batch = replay_buffer
        states, actions, rewards, next_states, dones = zip(*batch)
        
        # Sometimes the state is returned either as a tuple or as an ndarray. The issue rises when it's returned as a tuple.
        has_tuple = any(isinstance(element, tuple) for element in states)
        if has_tuple:
            continue # Simply skip the loop that contains faulty data.
        

        # Creating tensors but need to be converted into ndarrays first for faster loading.
        states = torch.from_numpy(np.array(states)).to(parameters['device'])/255 # Normalizing the tensor which will also convert it into float32 which the model accepts
        actions = torch.from_numpy(np.array(actions)).to(parameters['device'])
        rewards = torch.from_numpy(np.array(rewards)).to(parameters['device']).float()
        next_states = torch.from_numpy(np.array(next_states)).to(parameters['device'])/255
        dones = torch.from_numpy(np.array(dones)).to(parameters['device'])

        q_values = policy_net(states).gather(1, actions.type(torch.int64).unsqueeze(1)).squeeze(1) # Changing the data type of actions tensor before passing it to torch.gather
        next_q_values = target_net(next_states).max(1)[0].detach()
        target_q_values = rewards + (~dones) * parameters['gamma'] * next_q_values
        
        loss = F.mse_loss(q_values, target_q_values) 
        loss = loss.type(torch.float32)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        steps+=1 
        
        policy_loss_sum += loss.item()
        value_loss_sum += loss.item()

    
    
    if not fault:
        # Decay epsilon
        epsilon = max(parameters['epsilon_end'], epsilon - epsilon_decay_step)

        # Update target network periodically
        if episode % parameters['target_update'] == 0:
            target_net.load_state_dict(policy_net.state_dict())

        # Logging on TensorBoard
        writer.add_scalar('Training/Cumulative_Reward', total_reward, episode)
        writer.add_scalar('Training/Episode_Length', episode_length, episode)
        writer.add_scalar('Training/Policy_Loss', policy_loss_sum / episode_length, episode)
        writer.add_scalar('Training/Value_Loss', value_loss_sum / episode_length, episode)

        # Print episode information
        print(f'Episode {episode + 1}: Total Reward: {total_reward}, Episode Length: {episode_length}')


writer.close()

Episode 1: Total Reward: 252.0, Episode Length: 906
Episode 2: Total Reward: 189.0, Episode Length: 1022
Episode 3: Total Reward: 357.0, Episode Length: 980
Episode 4: Total Reward: 168.0, Episode Length: 685
Episode 5: Total Reward: 252.0, Episode Length: 731
Episode 6: Total Reward: 315.0, Episode Length: 791
Episode 7: Total Reward: 231.0, Episode Length: 811
Episode 8: Total Reward: 231.0, Episode Length: 970
Episode 9: Total Reward: 189.0, Episode Length: 874
Episode 10: Total Reward: 168.0, Episode Length: 1022
Episode 11: Total Reward: 147.0, Episode Length: 1302
Episode 12: Total Reward: 315.0, Episode Length: 895
Episode 13: Total Reward: 399.0, Episode Length: 885
Episode 14: Total Reward: 336.0, Episode Length: 801
Episode 15: Total Reward: 378.0, Episode Length: 855
Episode 16: Total Reward: 168.0, Episode Length: 815
Episode 17: Total Reward: 147.0, Episode Length: 688
Episode 18: Total Reward: 189.0, Episode Length: 927
Episode 19: Total Reward: 399.0, Episode Length: 138

* Assault - Explore - Total number of steps in 1000 episodes: 507143
* Assault - Explore - Total number of steps in 2000 episodes: 980745
* Centipede - Explore - Total number of steps in 1000 episodes: 714922

* Assault - Force Exploitation - Total number of steps in 1000 episodes: 849296


In case the imaged below don't show, use tensorboard --logdir=runs on the command line after training the model yourself to see the results.

In [165]:
print("Total number of steps: ", steps)

Total number of steps:  849296


In [16]:
#from IPython.display import Image
#Image(filename="./Logs/first test run 100 episodes.png")

In [17]:
#Image(filename="./Logs/1000 ep run.png") 

Saving models

In [166]:
file_path = '1k_assault_forced_exploit_policy_cnn.pth'
torch.save(policy_net.state_dict(), file_path)
file_path = '1k_assault_forced_exploit__target_cnn.pth'
torch.save(target_net.state_dict(), file_path)
print("Models saved successfully!")

Models saved successfully!


# Loading models

In [23]:
import os
path = './trained states' # Might need to edit depending on where you save.
file_name = "1k_assault_policy_cnn.pth" #'1k_assault_policy_cnn.pth'
pth_file_path = os.path.join(path, file_name)
pre_trained_policy_cnn = CNNPolicy(input_channels, output_dim) # Need to make the environment first to get the input and output dim <- Run Model and Environment Initialization
pre_trained_policy_cnn.load_state_dict(torch.load(pth_file_path))
pre_trained_policy_cnn.to(parameters['device']) # Need to run settings first to get the device <- Check section "Settings"
print("Policy CNN loaded successfully!\n")

Policy CNN loaded successfully!



In [None]:
pre_trained_policy_cnn.state_dict()

# Visualizing the trained agents

If you want to take a look on how the environment looks, that's the way to do it below. You can include it in the training loop but it might slow the training process, especially for a large number of episodes. 
* Disclaimer: It might cause the kernel to crash if you use VSCode once it's finished or you manually close it. 

In [24]:
import gym

pre_trained_policy_cnn.eval()
env = gym.make("Assault-v4", render_mode="human")
env.action_space.seed(42)

observation, info = env.reset(seed=42)
state_tensor = torch.FloatTensor(observation).unsqueeze(0).to(parameters['device'])
q_values = policy_net(state_tensor)
action = q_values.argmax().item()  # Select the best action

for _ in range(2000):
    observation, reward, terminated, truncated, info = env.step(action)

    if terminated or truncated:
        observation, info = env.reset()

env.close()

  return F.conv2d(input, weight, bias, self.stride,
  if not isinstance(terminated, (bool, np.bool8)):


: 