In [1]:
from dataloader import AtariDataset
import gym
import torch
import numpy as np
import random
import os

from dqn import DQN
import dqn

def reseed(seed):
    """Seed the PyTorch, Python and NumPy global random number generators.

    Args:
        seed (int): Value handed unchanged to each generator's seeding function.
    """
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)


# Seed everything once up front.
seed = 42
reseed(seed)

def make_env(env_id, seed=25):
    """Create a gym environment and seed it together with its spaces.

    Args:
        env_id (str): Environment id passed to ``gym.make``.
        seed (int): Seed applied to the environment, its action space and its
            observation space.

    Returns:
        The environment returned by ``gym.make``, created with grayscale
        observations, no render mode, ``repeat_action_probability=0.15`` and
        ``frameskip=1``.
    """
    env = gym.make(
        env_id,
        obs_type='grayscale',
        render_mode=None,
        repeat_action_probability=0.15,
        frameskip=1,
    )
    env.seed(seed)
    # The spaces are seeded separately from the environment itself.
    for space in (env.action_space, env.observation_space):
        space.seed(seed)
    return env
# Build the Space Invaders environment with the notebook-wide seed and show
# the number of discrete actions and the observation shape.
env = make_env("SpaceInvaders-v0", seed=seed)
print(env.action_space.n)
print(env.observation_space.shape)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

6
(210, 160)
cuda:0


In [None]:
def visualize(learner, env, video_name="test"):
    """Roll out a learner for a single episode and record it to a video file.

        Args:
            learner: Policy object exposing ``get_action``; it is called with a
            float tensor holding one observation, moved to the global ``device``.
            env: Gym environment to roll out in. It is closed once the episode
            ends (or fails).
            video_name (str): Base name of the video file that will be saved
            (omit the extension); the file is written as ``{video_name}.avi``.
    """

    import cv2

    print("Visualizing")

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    # VideoWriter takes the frame size as (width, height).
    video = cv2.VideoWriter(f"{video_name}.avi", fourcc, 24, (160,210), isColor = True)
    total_reward = 0
    try:
        obs = env.reset()
        done = False
        while not done:
            # Action selection only; no gradients are needed.
            with torch.no_grad():
                action = learner.get_action(torch.Tensor([obs]).to(device))
            obs, reward, done, info = env.step(action)

            total_reward += reward

            if done:
                break

            im = env.render(mode='rgb_array')

            # The environment renders RGB frames, while OpenCV's writer expects
            # BGR channel order; convert so colours are not swapped in the video.
            video.write(cv2.cvtColor(im, cv2.COLOR_RGB2BGR))
    finally:
        # Always finalise the file and release the environment, even if the
        # rollout raised part-way through.
        video.release()
        env.close()

    print(f"Video saved as {video_name}.avi")
    print("Reward: " + str(total_reward))

## LOAD ATARI DATA

In [4]:
dataloader = AtariDataset("atari_v1", 15)
observations, actions, rewards, next_observations, dones = dataloader.compile_data()

# np.save does not create missing parent directories, so make sure the target
# folder exists before writing the arrays.
os.makedirs('numpy_data/atari', exist_ok=True)

np.save('numpy_data/atari/observations', observations)
np.save('numpy_data/atari/actions', actions)
np.save('numpy_data/atari/rewards', rewards)
np.save('numpy_data/atari/dones', dones)
np.save('numpy_data/atari/next_observations', next_observations)

15
[1960, 1870, 1770, 1705, 1700, 1685, 1665, 1660, 1660, 1605, 1605, 1580, 1525, 1490, 1470]


In [2]:
# Reload the compiled Atari transitions from disk. Each variable must come from
# the file of the same kind: next observations and done flags are not
# interchangeable.
atari_obs = np.load('numpy_data/atari/observations.npy')
atari_act = np.load('numpy_data/atari/actions.npy')
atari_rew = np.load('numpy_data/atari/rewards.npy')
atari_next = np.load('numpy_data/atari/next_observations.npy')
atari_done = np.load('numpy_data/atari/dones.npy')

## DATA COLLECTION

In [3]:
def data_collect(learner, env, num_episodes, save_path, device):
    """Roll out a learner and save the visited transitions as ``.npy`` files.

    Args:
        learner: Policy object exposing ``get_action``. ``DQN`` instances are
            called with ``eps=0.0``; any other learner is called with the
            observation tensor only.
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(next_obs, reward, done, info)``.
        num_episodes (int): Number of episodes to roll out; each runs until
            ``done`` is truthy.
        save_path (str): Directory that receives ``observations``, ``actions``,
            ``rewards``, ``dones`` and ``next_observations`` (NumPy appends
            ``.npy``). It is created if it does not exist.
        device: Torch device the observation tensor is moved to before it is
            passed to the learner.
    """
    # Only DQN learners receive an explicit exploration rate.
    action_kwargs = {'eps': 0.0} if isinstance(learner, DQN) else {}

    observations = []
    actions = []
    rewards = []
    next_observations = []
    dones = []
    for _ in range(num_episodes):
        obs = env.reset()
        done = False
        while not done:
            with torch.no_grad():
                action = learner.get_action(
                    torch.tensor(obs).to(device).unsqueeze(0), **action_kwargs
                )
            next_obs, reward, done, _ = env.step(action)
            observations.append(obs)
            actions.append(action)
            rewards.append(reward)
            next_observations.append(next_obs)
            dones.append(done)
            obs = next_obs

    # np.save does not create missing directories; an empty save_path means the
    # current directory and needs no creation.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    np.save(os.path.join(save_path, 'observations'), observations)
    np.save(os.path.join(save_path, 'actions'), actions)
    np.save(os.path.join(save_path, 'rewards'), rewards)
    np.save(os.path.join(save_path, 'dones'), dones)
    np.save(os.path.join(save_path, 'next_observations'), next_observations)

## LOAD BC

In [4]:
from bc import SpaceInvLearner

bc_learner = SpaceInvLearner(env)

# map_location remaps the stored tensors onto this session's device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
bc_learner.load_state_dict(torch.load('models/bc_learner.pth', map_location=device))

<All keys matched successfully>

In [5]:
# Roll out the behaviour-cloning learner for 25 episodes and store the
# transitions under numpy_data/bc.
data_collect(
    learner=bc_learner,
    env=env,
    num_episodes=25,
    save_path='numpy_data/bc',
    device=device,
)

## LOAD DAGGER

In [6]:
dagger_learner = SpaceInvLearner(env)

# map_location remaps the stored tensors onto this session's device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
dagger_learner.load_state_dict(torch.load('models/DAgger.pth', map_location=device))

<All keys matched successfully>

In [7]:
# Roll out the DAgger learner for 25 episodes and store the transitions under
# numpy_data/dagger.
data_collect(
    learner=dagger_learner,
    env=env,
    num_episodes=25,
    save_path='numpy_data/dagger',
    device=device,
)

In [None]:
import matplotlib.pyplot as plt

def plot(values, label):
    """Show a line chart of ``values`` against their index.

    Args:
        values: Sequence of numbers to plot; the x-axis is the element index
            and is labelled 'Epoch'.
        label (str): Text for the y-axis.
    """
    epochs = np.arange(len(values))
    plt.plot(epochs, values)
    plt.xlabel('Epoch')
    plt.ylabel(label)
    plt.show()

## TRAIN DQN w/ ATARI

In [10]:
from dqn import DQN
import dqn

# Network sizes: 210*160 values per observation, one output per discrete action.
INPUT_SHAPE = 210*160
ACTION_SIZE = env.action_space.n

atari_dqn_learner = DQN(INPUT_SHAPE, ACTION_SIZE)

# Train on the Atari transitions loaded earlier and save the weights.
dqn.train(
    atari_dqn_learner,
    env,
    observations=atari_obs,
    actions=atari_act,
    rewards=atari_rew,
    next_observations=atari_next,
    dones=atari_done,
    save_path='models/atari_dqn.pth',
)

  0%|          | 0/100 [00:12<?, ?it/s]


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (128,) + inhomogeneous part.

In [None]:
# Loss values recorded on the Atari-trained DQN learner.
plot(values=atari_dqn_learner.test_loss, label='Loss')

In [None]:
# Scores recorded on the Atari-trained DQN learner.
plot(values=atari_dqn_learner.test_scores, label='Score')

## TRAIN DQN w/ BC

In [None]:
# Transitions stored under numpy_data/bc.
bc_obs = np.load('numpy_data/bc/observations.npy')
bc_act = np.load('numpy_data/bc/actions.npy')
bc_rew = np.load('numpy_data/bc/rewards.npy')
bc_next = np.load('numpy_data/bc/next_observations.npy')
bc_done = np.load('numpy_data/bc/dones.npy')

bc_dqn_learner = DQN(INPUT_SHAPE, ACTION_SIZE)

# Train a fresh DQN on those transitions and save the weights.
dqn.train(
    bc_dqn_learner,
    env,
    observations=bc_obs,
    actions=bc_act,
    rewards=bc_rew,
    next_observations=bc_next,
    dones=bc_done,
    save_path='models/bc_dqn.pth',
)

In [None]:
# Loss values recorded on the BC-data DQN learner.
plot(values=bc_dqn_learner.test_loss, label='Loss')

In [None]:
# Scores recorded on the BC-data DQN learner.
plot(values=bc_dqn_learner.test_scores, label='Score')

## TRAIN DQN w/ DAgger

In [None]:
# Transitions stored under numpy_data/dagger.
dagger_obs = np.load('numpy_data/dagger/observations.npy')
dagger_act = np.load('numpy_data/dagger/actions.npy')
dagger_rew = np.load('numpy_data/dagger/rewards.npy')
dagger_done = np.load('numpy_data/dagger/dones.npy')
dagger_next = np.load('numpy_data/dagger/next_observations.npy')

dagger_dqn_learner = DQN(INPUT_SHAPE, ACTION_SIZE)

# Each keyword must receive the array of the same kind: next observations go to
# next_observations and done flags go to dones.
dqn.train(
    dagger_dqn_learner,
    env,
    observations=dagger_obs,
    actions=dagger_act,
    rewards=dagger_rew,
    next_observations=dagger_next,
    dones=dagger_done,
    save_path='models/dagger_dqn.pth',
)

In [None]:
# Loss values recorded on the DAgger-data DQN learner.
plot(values=dagger_dqn_learner.test_loss, label='Loss')

In [None]:
# Scores recorded on the DAgger-data DQN learner.
plot(values=dagger_dqn_learner.test_scores, label='Score')

## TEST DQN

In [None]:
# (label, mean episode reward) pairs; each evaluation cell below appends one
# entry and the final cell draws them as a bar chart. Re-running an evaluation
# cell without re-running this one appends a duplicate entry.
avg_scores = []

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

dqn_learner = DQN(INPUT_SHAPE, ACTION_SIZE)

# map_location lets a checkpoint saved on a GPU load on a CPU-only session.
dqn_learner.load_state_dict(torch.load('models/atari_dqn.pth', map_location=device), strict=True)

# Total reward of each of the 50 evaluation episodes.
total_learner_reward = []

for i in range(50):
    obs = env.reset()
    done = False
    sum_reward = 0
    while not done:
        with torch.no_grad():
            # Pass eps=0.0 explicitly, like the other DQN evaluations, so the
            # score does not depend on whatever default exploration rate
            # get_action has.
            action = dqn_learner.get_action(torch.Tensor([obs]).to(device), eps=0.0)
        obs, reward, done, info = env.step(action)
        sum_reward += reward
    total_learner_reward.append(sum_reward)

atari_mean = np.mean(total_learner_reward)
avg_scores.append(('AGC DQN', atari_mean))

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Reuses the dqn_learner network built in the previous evaluation cell and
# overwrites its weights. map_location lets a checkpoint saved on a GPU load on
# a CPU-only session.
dqn_learner.load_state_dict(torch.load('models/bc_dqn.pth', map_location=device), strict=True)

# Total reward of each of the 50 evaluation episodes.
total_learner_reward = []

for i in range(50):
    obs = env.reset()
    done = False
    sum_reward = 0
    while not done:
        with torch.no_grad():
            action = dqn_learner.get_action(torch.Tensor([obs]).to(device), eps=0.0)
        obs, reward, done, info = env.step(action)
        sum_reward += reward
    total_learner_reward.append(sum_reward)

bc_dqn_mean = np.mean(total_learner_reward)
avg_scores.append(('BC DQN', bc_dqn_mean))

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Reuses the dqn_learner network built in an earlier evaluation cell and
# overwrites its weights. map_location lets a checkpoint saved on a GPU load on
# a CPU-only session.
dqn_learner.load_state_dict(torch.load('models/dagger_dqn.pth', map_location=device), strict=True)

# Total reward of each of the 50 evaluation episodes.
total_learner_reward = []

for i in range(50):
    obs = env.reset()
    done = False
    sum_reward = 0
    while not done:
        with torch.no_grad():
            action = dqn_learner.get_action(torch.Tensor([obs]).to(device), eps=0.0)
        obs, reward, done, info = env.step(action)
        sum_reward += reward
    total_learner_reward.append(sum_reward)

dagger_dqn_mean = np.mean(total_learner_reward)
avg_scores.append(('DAgger DQN', dagger_dqn_mean))

In [None]:
# Evaluate the behaviour-cloning learner: total reward of each of 50 episodes.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
total_learner_reward = []

for i in range(50):
    obs, done, sum_reward = env.reset(), False, 0
    while not done:
        with torch.no_grad():
            obs_batch = torch.Tensor([obs]).to(device)
            action = bc_learner.get_action(obs_batch)
        obs, reward, done, info = env.step(action)
        sum_reward += reward
    total_learner_reward.append(sum_reward)

bc_mean = np.mean(total_learner_reward)
avg_scores.append(('BC', bc_mean))

In [None]:
# Evaluate the DAgger learner: total reward of each of 50 episodes.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
total_learner_reward = []

for i in range(50):
    obs = env.reset()
    done = False
    sum_reward = 0
    while not done:
        with torch.no_grad():
            action = dagger_learner.get_action(torch.Tensor([obs]).to(device))
        obs, reward, done, info = env.step(action)
        sum_reward += reward
    total_learner_reward.append(sum_reward)

# Keep the DAgger result in its own variable so the behaviour-cloning mean
# stored in bc_mean is not overwritten.
dagger_mean = np.mean(total_learner_reward)
avg_scores.append(('DAgger', dagger_mean))

In [None]:
# Split the (label, score) pairs into parallel lists for the bar chart.
names = [name for name, _score in avg_scores]
scores = [score for _name, score in avg_scores]

# One bar per evaluated model.
plt.bar(names, scores, color='skyblue')

# Title and axis labels.
plt.title('Average scores over 50 runs')
plt.xlabel('Models')
plt.ylabel('Scores')

# Render the figure.
plt.show()

