In [1]:
import os
from datetime import datetime
run_start_time = datetime.now().strftime('%Y%m%d_%H%M%S')

In [2]:
from torch.utils.tensorboard import SummaryWriter
import gymnasium as gym
import torch
from PPO.PPO import PPO
import numpy as np
from collections import deque

Device set to : NVIDIA GeForce GTX 1080 Ti


In [3]:
from AlienEnv.alienrl_env import AlienRLEnv

In [4]:
env_name = "AlienRLEnv"

In [5]:
# env = gym.make(env_name)
env = AlienRLEnv()

In [6]:
obs, _ = env.reset()

In [7]:
# Not implemented in current version, currently, batch_size = buffer_size
batch_size = 1024

max_training_timesteps = 1_000_000  # break training loop if timesteps > max_training_timesteps

print_freq = 50 # batch_size * 10
save_model_freq = 50_000

# Starting standard deviation for action distribution
action_sd = 0.6
# Linearly decay action_sd where, action_sd = action_sd - action_sd_decay_rate
action_sd_decay_rate = 0.05        
# Set minimum action standard deviation
min_action_sd = 0.1                
# action standard devation decay frequency
action_sd_decay_freq = 250000

# Batch/buffer size for training, should be multiple of batch_size
# buffer_size = batch_size * 1  # 1024 - Converged faster, at 300k timesteps (ent_coef = 0.0)
buffer_size = batch_size * 4  # 4096 - Converged at 500k timesteps (ent_coef = 0.001)
# buffer_size = batch_size * 40 # 40960 - Converges at much slower rate and stable rate

# Update policy for n epochs
num_of_epochs = 128 # 80

eps_clip = 0.2
gamma = 0.99
lr_actor = 0.0003
lr_critic = 0.001
ent_coef = 0.0 # 0.001 # Increasing entropy coefficient helps exploration, 0 seems to be the best value
vf_coef = 0.5

state_dim = sum(env.observation_space['framestack'].shape) + env.observation_space['telemetry'].shape[0]

# action space dimension
action_dim = env.action_space.shape[0]

In [8]:
checkpoint_path = "models" + '/' + f"{run_start_time}_{ent_coef}" + "/"

if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

logs_dir = f"runs/{run_start_time}_{ent_coef}"

writer = SummaryWriter(logs_dir)

In [9]:
# initialize a PPO agent
agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, num_of_epochs, eps_clip, ent_coef, vf_coef, action_sd)

In [10]:
print("Initialisation complete.")

# track total training time
start_time = datetime.now().replace(microsecond=0)
print("Training started at: ", start_time)

# printing and logging variables
total_episodes = 0

global_step_num = 0
episode_num = 1

reward_history = deque(maxlen=100)
best_reward = env.reward_range[0]

Initialisation complete.
Training started at:  2023-07-28 03:12:10


In [11]:
# training loop
while global_step_num <= max_training_timesteps:

    state, _ = env.reset()
    episode_reward = 0
    done = False
    trunc = False

    while not done and not trunc:
        
        # Select action with policy
        action = agent.select_action(state)
        state, reward, done, trunc, _ = env.step(action)

        # Saving reward and is_terminals
        agent.buffer.rewards.append(reward)
        agent.buffer.is_terminals.append(done or trunc)

        global_step_num += 1
        episode_reward += reward

        # Update agent
        if global_step_num % buffer_size == 0:
            agent.update()

        # Decay action std of ouput action distribution
        if global_step_num % action_sd_decay_freq == 0:
            agent.decay_action_sd(action_sd_decay_rate, min_action_sd)

        if global_step_num % save_model_freq == 0:
            print("Saving model.")
            agent.save(f"{checkpoint_path}{global_step_num}.pth")

    reward_history.append(episode_reward)
    avg_reward = np.mean(reward_history)

    if avg_reward > best_reward and len(reward_history) >= 100:
        best_reward = avg_reward
        
    
    if (total_episodes+1) % print_freq == 0:
        print(f"Episode: {episode_num} \t Total Steps: {global_step_num} \t Average Reward: {avg_reward:.02f} \t Best Reward: {best_reward:.02f}, \t Elapsed Time: {datetime.now().replace(microsecond=0) - start_time}")

    writer.add_scalar('Reward', episode_reward, global_step=global_step_num)
    writer.add_scalar('Average Reward', avg_reward, global_step=global_step_num)

    total_episodes += 1

    episode_num += 1

writer.close()
env.close()

end_time = datetime.now().replace(microsecond=0)
print()
print("Started training at: ", start_time)
print("Finished training at: ", end_time)
print("Total training time: ", end_time - start_time)

torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<class 'torch.Tensor'>
torch.Size([1, 12, 84, 84])
<cl

KeyboardInterrupt: 

In [22]:
fs = torch.rand(4,3,84,84)
fs.shape

torch.Size([4, 3, 84, 84])

In [24]:
fs_ft = torch.FloatTensor(obs['framestack'])
fs_ft.shape

torch.Size([4, 3, 84, 84])

In [25]:
fs_ft = fs_ft.view(12, 84, 84)
fs_ft.shape

torch.Size([12, 84, 84])

In [48]:
import torch.nn as nn
class ActorNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorNetwork, self).__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(12, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((7, 7)),
            nn.Flatten(),
            nn.Linear(3136, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(512, 128),
            nn.ReLU()
        )
        
        self.fcn1 = nn.Sequential(
            nn.Linear(20, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(128, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(p=0.2)
        )
        
        self.fc2 = nn.Sequential(
            nn.Linear(128 * 2, 512),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(256, 32),
            nn.ReLU(),
            nn.Linear(32, 2),
            nn.Tanh(),
        )

    def forward(self, obs):
        framestack = obs['framestack']
        telemetry = obs['telemetry']
        print(framestack.shape)
        print(type(telemetry))
        framestack_embedding = self.cnn(framestack)
        telemetry_embedding = self.fcn1(telemetry)

        if len(telemetry_embedding.shape) == 1:
            telemetry_embedding = telemetry_embedding.unsqueeze(0)

        concatenated = torch.cat((framestack_embedding, telemetry_embedding), dim=1)

        return self.fc2(concatenated)

In [49]:
actor = ActorNetwork(1,1)

In [50]:
fs_ft.shape

torch.Size([12, 84, 84])

In [51]:
actor.cnn(fs_ft).shape

RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x49 and 3136x1024)

In [46]:
actor.cnn(fs_ft).reshape(-1, 3136).shape

torch.Size([1, 3136])

In [52]:
test_data = torch.rand(1, 12, 84, 84)

In [53]:
actor.cnn(test_data).shape

torch.Size([1, 128])