In [1]:
from dataloader import AtariDataset
import gym
import torch.nn as nn
import torch
import numpy as np
import random
import tqdm
from tqdm import tqdm
import torch.nn.functional as F
from torch.optim import optimizer
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay
import cv2

## SEEDING

In [2]:
def reseed(seed):
    """Seed PyTorch, Python's `random` module and NumPy's global RNG with one value.

    Args:
        seed (int): Seed handed unchanged to each of the three generators.
    """
    # Same order as before: torch first, then random, then numpy.
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)


seed = 42
reseed(seed)


## LOAD DATA

In [3]:
# Build the dataset wrapper for the "atari_v1" data source.
# NOTE(review): the second argument (15) is presumably the number of trajectories to
# load (the cell output prints 15 followed by 15 values) — confirm in dataloader.py.
dataloader = AtariDataset("atari_v1", 15)
# compile_data() returns five values, unpacked here in this order.
observations, actions, rewards, next_observations, dones = dataloader.compile_data()

15
[1960, 1870, 1770, 1705, 1700, 1685, 1665, 1660, 1660, 1605, 1605, 1580, 1525, 1490, 1470]


## MAKE ENVIRONMENT

In [4]:
def make_env(env_id, seed=25):
    """Create a gym environment with the notebook's fixed options and seed it.

    Args:
        env_id (str): Environment id forwarded to ``gym.make``.
        seed (int): Seed applied to the environment, its action space and its
            observation space.

    Returns:
        The environment returned by ``gym.make`` after seeding.
    """
    make_options = {
        'obs_type': 'grayscale',
        'render_mode': 'rgb_array',
        'repeat_action_probability': 0.15,
        'frameskip': 1,
    }
    env = gym.make(env_id, **make_options)
    # Seed the env itself first, then both of its spaces.
    for seedable in (env, env.action_space, env.observation_space):
        seedable.seed(seed)
    return env
# Create the Space Invaders environment, seeded with the notebook-level `seed`.
env = make_env("SpaceInvaders-v0", seed=seed)
# Report the size of the action space and the shape of a single observation.
print(env.action_space.n)
print(env.observation_space.shape)


# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device: ", device)

A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


6
(210, 160)
Device:  cuda:0


In [5]:
def visualize(learner, env, video_name="test"):
    """Roll out `learner` in `env` for a single episode and save the frames as a video.

        Args:
            learner: Policy object exposing ``get_action(obs_batch)``. It is called with a
            float32 tensor holding one observation (a leading batch axis of size 1 is added)
            on the module-level ``device``.
            env: Gym environment to roll out in. NOTE: the environment is closed once the
            episode finishes, so callers that need it afterwards must account for that.
            video_name (str): Name for the video file of the episode that will be saved
            (omit the extension); the file is written to ``{video_name}.avi``.
    """

    import cv2

    print("Visualizing")

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    # VideoWriter takes (width, height); the frames written here are 210 high by 160 wide.
    video = cv2.VideoWriter(f"{video_name}.avi", fourcc, 24, (160, 210), isColor=True)
    total_reward = 0
    try:
        obs = env.reset()
        while True:
            # as_tensor avoids the slow list-of-ndarrays path taken by torch.Tensor([obs]).
            obs_batch = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0).to(device)
            action = learner.get_action(obs_batch)
            obs, reward, done, info = env.step(action)

            total_reward += reward

            # The terminal frame is not rendered, matching the previous behaviour.
            if done:
                break

            im = env.render(mode='rgb_array')

            # OpenCV writers expect BGR channel order; the renderer returns RGB.
            video.write(cv2.cvtColor(im, cv2.COLOR_RGB2BGR))
    finally:
        # Always finalise the file, even if the rollout is interrupted or raises.
        video.release()

    env.close()
    print(f"Video saved as {video_name}.avi")
    print("Reward: " + str(total_reward))

## Train BC (behavioral cloning) learner

In [6]:
from bc import SpaceInvLearner
import bc

# Construct the learner policy from the environment.
learner = SpaceInvLearner(env)

# Training is left disabled here; the following cell loads weights from
# "models/bc_learner.pth", the same path used as checkpoint_path below.
# Uncomment to retrain from the loaded observations/actions.
# bc.train(learner=learner, observations=observations, checkpoint_path="models/bc_learner.pth", actions=actions, num_epochs=25)

In [21]:
# Pick the device first so the checkpoint can be mapped onto it while loading.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine as well.
learner.load_state_dict(torch.load("models/bc_learner.pth", map_location=device), strict=True)
# NOTE(review): the two names below are not read in the visible cells; kept in case
# other cells rely on them.
total_learner_reward = 0
done = False


# Roll out the behavioral-cloning learner for one episode and save it as bc_learner.avi.
visualize(learner, env, "bc_learner")

Visualizing
Video saved as bc_learner.avi
Reward: 120.0


## LOAD EXPERT

In [8]:
from expert.ppo import PPOAgent, ActorCnn, CriticCnn

# Hyper-parameters handed to the PPO expert below.
INPUT_SHAPE = (4, 84, 84)  # input shape given to the actor/critic CNNs
ACTION_SIZE = env.action_space.n  # number of actions, taken from the environment
SEED = 0               # seed passed to PPOAgent (distinct from the notebook-level `seed`)
GAMMA = 0.99           # discount factor
ALPHA= 0.00001         # actor learning rate
BETA = 0.00001          # critic learning rate
TAU = 0.95             # NOTE(review): presumably the GAE lambda — confirm in expert/ppo.py
BATCH_SIZE = 64
PPO_EPOCH = 10
CLIP_PARAM = 0.2
UPDATE_EVERY = 1000    # how often to update the network

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Build the expert agent with freshly constructed actor/critic networks, then load the
# saved actor weights from disk.
agent = PPOAgent(INPUT_SHAPE, ACTION_SIZE, SEED, device, GAMMA, ALPHA, BETA, TAU, UPDATE_EVERY, BATCH_SIZE, PPO_EPOCH, CLIP_PARAM, ActorCnn(INPUT_SHAPE, ACTION_SIZE), CriticCnn(INPUT_SHAPE))
agent.load_model("models/expert_actor.pth", device)

## DAgger Implementation

In [9]:
import dagger

# Run DAgger with the learner, the PPO agent and the initial dataset.
# Per the cell output, each interaction reports the learner's reward and then trains the
# learner for `num_epochs` epochs. A later cell loads the weights from "models/DAgger.pth".
# NOTE(review): `agent` is presumably used as the expert that relabels visited states —
# confirm in dagger.py.
dagger.interact(env, learner, agent, observations=observations, actions=actions, checkpoint_path="models/DAgger.pth", seed=seed, num_epochs=25, tqdm_disable=True)

After interaction 0, reward = 120.0
Training the learner
Training for 25 epochs
Epoch 0, Loss: 0.04350557899531346
Epoch 1, Loss: 0.042649096414284264
Epoch 2, Loss: 0.042080336482019934
Epoch 3, Loss: 0.041386895192833166
Epoch 4, Loss: 0.04097874933073133
Epoch 5, Loss: 0.04056245065371453
Epoch 6, Loss: 0.03996931577004949
Epoch 7, Loss: 0.03974763405149023
Epoch 8, Loss: 0.03914307540737914
Epoch 9, Loss: 0.03867110315499403
Epoch 10, Loss: 0.038284687184440676
Epoch 11, Loss: 0.037936118146633724
Epoch 12, Loss: 0.03745183003768604
Epoch 13, Loss: 0.03703958397493739
Epoch 14, Loss: 0.03677545884181205
Epoch 15, Loss: 0.03638671566813788
Epoch 16, Loss: 0.036022508853681946
Epoch 17, Loss: 0.03569208938625805
Epoch 18, Loss: 0.03547194669099109
Epoch 19, Loss: 0.03531559657425425
Epoch 20, Loss: 0.03473373544232266
Epoch 21, Loss: 0.03452082235352728
Epoch 22, Loss: 0.03418208397797346
Epoch 23, Loss: 0.03396151494406617
Epoch 24, Loss: 0.03367556526325791
After interaction 1, rew

KeyboardInterrupt: 

In [27]:
# Pick the device first so the checkpoint can be mapped onto it while loading.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine as well.
learner.load_state_dict(torch.load("models/DAgger.pth", map_location=device), strict=True)
# NOTE(review): the two names below are not read in the visible cells; kept in case
# other cells rely on them.
total_learner_reward = 0
done = False

# Roll out the DAgger-trained learner for one episode and save it as dagger_learner.avi.
visualize(learner, env, "dagger_learner")

Visualizing
Video saved as dagger_learner.avi
Reward: 190.0
