In [1]:
from dataloader import AtariDataset
import gym
import torch.nn as nn
import torch
import numpy as np
import random
import tqdm
from tqdm import tqdm
import torch.nn.functional as F
from torch.optim import optimizer
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay
import cv2

## SEEDING

In [2]:
def reseed(seed):
  """Seed the global random number generators used in this notebook.

  Args:
      seed (int): Value passed to PyTorch, Python's ``random`` module and
          NumPy's legacy global generator.
  """
  for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    seed_fn(seed)


# Notebook-wide seed; it is also passed to make_env and dagger.interact below.
seed = 42
reseed(seed)


## LOAD DATA

In [3]:
# Build the dataset wrapper for "atari_v1" and unpack the five sequences that
# compile_data() returns.
# NOTE(review): the meaning of the second constructor argument (2) is defined in
# dataloader.py and is not visible here — confirm before changing it.
dataloader = AtariDataset("atari_v1", 2)
observations, actions, rewards, next_observations, dones = dataloader.compile_data()

3
[1960, 1870, 1770]


## MAKE ENVIRONMENT

In [4]:
def make_env(env_id, seed=25):
    """Create a gym environment and seed it together with its spaces.

    Args:
        env_id (str): Environment id handed to ``gym.make``.
        seed (int): Seed applied to the environment, its action space and
            its observation space.

    Returns:
        The environment created by ``gym.make``, already seeded.
    """
    make_kwargs = dict(
        obs_type='grayscale',
        render_mode='rgb_array',
        repeat_action_probability=0.15,
        frameskip=1,
    )
    env = gym.make(env_id, **make_kwargs)
    # The environment and both of its spaces are seeded with the same value.
    for seedable in (env, env.action_space, env.observation_space):
        seedable.seed(seed)
    return env
# Create the shared environment with the notebook-wide seed and report the
# size of its action space and the shape of its observations.
env = make_env("SpaceInvaders-v0", seed=seed)
print(env.action_space.n)
print(env.observation_space.shape)


# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device: ", device)

6
(210, 160)
Device:  cuda:0


In [5]:
def visualize(learner, env, video_name="test"):
    """Roll out ``learner`` for a single episode in ``env`` and save it as a video.

    Args:
        learner: Policy object exposing ``get_action(obs_batch)``. It is called
            with a float32 tensor of shape ``(1, *obs.shape)`` moved to the
            global ``device``.
        env: Gym environment using the classic API (``reset()`` returns the
            observation, ``step()`` returns a 4-tuple). It is closed once the
            episode has finished.
        video_name (str): Output file name without extension; the video is
            written to ``{video_name}.avi``.
    """

    import cv2

    print("Visualizing")

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    # OpenCV takes the frame size as (width, height).
    video = cv2.VideoWriter(f"{video_name}.avi", fourcc, 24, (160, 210), isColor=True)
    total_reward = 0
    try:
        obs = env.reset()
        done = False
        while not done:
            # Convert the observation directly instead of wrapping it in a
            # Python list, which avoids PyTorch's slow list-of-ndarrays path.
            obs_batch = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0).to(device)
            # Inference only: no autograd graph is needed for the rollout.
            with torch.no_grad():
                action = learner.get_action(obs_batch)
            obs, reward, done, info = env.step(action)

            total_reward += reward

            if done:
                break

            frame = env.render(mode='rgb_array')

            # The renderer returns RGB frames, but VideoWriter expects BGR.
            video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        # Always finalize the file, even if the rollout raises.
        video.release()

    env.close()
    print(f"Video saved as {video_name}.avi")
    print("Reward: " + str(total_reward))

## TRAIN DQN (TEST)

In [6]:

# from dqn import DQN
# import dqn

# INPUT_SHAPE = (210, 160)
# ACTION_SIZE = env.action_space.n

# dqn_learner = DQN(INPUT_SHAPE, ACTION_SIZE)

# dqn.train(dqn_learner, env, observations=observations, actions=actions, rewards=rewards, next_observations=next_observations, dones=dones, save_path='models/dqn_test.pth')

## Train BC

In [7]:
# The behaviour-cloning learner class and its training routine come from the
# project-local bc module.
from bc import SpaceInvLearner
import bc

# The learner is constructed from the environment object.
learner = SpaceInvLearner(env)

# Train on the loaded observations/actions for 100 epochs.
# NOTE(review): presumably bc.train saves the model to checkpoint_path (a later
# cell loads "models/bc_learner.pth") — confirm in bc.py.
bc.train(learner=learner, observations=observations, checkpoint_path="models/bc_learner.pth", actions=actions, num_epochs=100)

Training the learner
Training for 100 epochs


  1%|          | 1/100 [00:15<26:00, 15.76s/it]

Epoch 0, Loss: 0.30219602271150975


  2%|▏         | 2/100 [00:31<25:22, 15.53s/it]

Epoch 1, Loss: 0.15456958267999046


  3%|▎         | 3/100 [00:46<24:54, 15.41s/it]

Epoch 2, Loss: 0.14067100293862336


In [None]:
# Restore the behaviour-cloning checkpoint. map_location remaps the stored
# tensors onto the active device, so a checkpoint saved on a GPU still loads
# on a CPU-only machine.
learner.load_state_dict(torch.load("models/bc_learner.pth", map_location=device), strict=True)
total_learner_reward = 0
done = False

# Disabled: average the episode reward over 20 rollouts.
# for i in range(20):
#     obs = env.reset()
#     done = False
#     while not done:
#         with torch.no_grad():
#             action = learner.get_action(torch.Tensor([obs]).to(device))
#         obs, reward, done, info = env.step(action)
#         total_learner_reward += reward
#         if done:
#             break

# print(total_learner_reward/20)

# Record one episode of the restored learner to bc_learner.avi.
visualize(learner, env, "bc_learner")

Visualizing


  action = learner.get_action(torch.Tensor([obs]).to(device))


Video saved as bc_learner.avi
Reward: 0.0


## LOAD EXPERT

In [None]:
from expert.ppo import PPOAgent, ActorCnn, CriticCnn

# Settings passed to the PPO expert agent.
INPUT_SHAPE = (4, 84, 84)
ACTION_SIZE = env.action_space.n
SEED = 0
GAMMA = 0.99         # discount factor
ALPHA = 1e-5         # actor learning rate
BETA = 1e-5          # critic learning rate
TAU = 0.95
BATCH_SIZE = 64
PPO_EPOCH = 10
CLIP_PARAM = 0.2
UPDATE_EVERY = 1000  # how often to update the network

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# All arguments are positional; keep them in exactly this order.
agent = PPOAgent(
    INPUT_SHAPE,
    ACTION_SIZE,
    SEED,
    device,
    GAMMA,
    ALPHA,
    BETA,
    TAU,
    UPDATE_EVERY,
    BATCH_SIZE,
    PPO_EPOCH,
    CLIP_PARAM,
    ActorCnn(INPUT_SHAPE, ACTION_SIZE),
    CriticCnn(INPUT_SHAPE),
)
# Load the saved expert checkpoint onto the selected device.
agent.load_model("models/expert_actor.pth", device)

## DAgger Implementation

In [None]:
# The DAgger interaction loop comes from the project-local dagger module.
import dagger

# Run DAgger with the current learner and the expert agent, starting from empty
# observation/action lists and training for 40 epochs.
# NOTE(review): presumably the learner is saved to checkpoint_path (the next
# cell loads "models/DAgger.pth") — confirm in dagger.py.
dagger.interact(env, learner, agent, observations=[], actions=[], checkpoint_path="models/DAgger.pth", seed=seed, num_epochs=40, tqdm_disable=True)

After interaction 0, reward = 0.0
Training the learner
Training for 40 epochs
Epoch 0, Loss: 0.35487094736701746
Epoch 1, Loss: 0.2857381035274547
Epoch 2, Loss: 0.2684738138952841
Epoch 3, Loss: 0.2645116412467475
Epoch 4, Loss: 0.26340705111138657
Epoch 5, Loss: 0.26341036870161116
Epoch 6, Loss: 0.2624255468914225
Epoch 7, Loss: 0.25903930108900103
Epoch 8, Loss: 0.25154174764233805
Epoch 9, Loss: 0.2485266792322324
Epoch 10, Loss: 0.24379143885972268
Epoch 11, Loss: 0.23559911291952168
Epoch 12, Loss: 0.22538839310730407
Epoch 13, Loss: 0.21524493642662407
Epoch 14, Loss: 0.19536946695634175
Epoch 15, Loss: 0.16678819236772585
Epoch 16, Loss: 0.15229528098760528
Epoch 17, Loss: 0.1451131922566073
Epoch 18, Loss: 0.14221671982147202
Epoch 19, Loss: 0.14242760058129308
Epoch 20, Loss: 0.14339944529834636
Epoch 21, Loss: 0.13985652510439875
Epoch 22, Loss: 0.13796875419814664
Epoch 23, Loss: 0.13630981119315977
Epoch 24, Loss: 0.1351237023027365
Epoch 25, Loss: 0.1351900327722088
Epoc

In [None]:
# Select the device first so the checkpoint can be mapped onto it while loading;
# this keeps a GPU-saved checkpoint loadable on a CPU-only machine.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
learner.load_state_dict(torch.load("models/DAgger.pth", map_location=device), strict=True)

# Number of evaluation rollouts; used for both the loop and the average.
NUM_EVAL_EPISODES = 20
total_learner_reward = 0
done = False

for i in range(NUM_EVAL_EPISODES):
    obs = env.reset()
    done = False
    while not done:
        # Convert the observation directly instead of wrapping it in a Python
        # list, which avoids PyTorch's slow list-of-ndarrays path.
        obs_batch = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            action = learner.get_action(obs_batch)
        obs, reward, done, info = env.step(action)
        total_learner_reward += reward

# Mean total reward per episode.
print(total_learner_reward / NUM_EVAL_EPISODES)

118.25
