In [1]:
from dataloader import AtariDataset
import gym
import torch.nn as nn
import torch
import numpy as np
import random
import tqdm
from tqdm import tqdm
import torch.nn.functional as F
from torch.optim import optimizer
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay
import cv2

## SEEDING

In [2]:
def reseed(seed):
  torch.manual_seed(seed)
  random.seed(seed)
  np.random.seed(seed)
seed = 42
reseed(seed)


## LOAD DATA

In [3]:
dataloader = AtariDataset("atari_v1", 15)
observations, actions, rewards, next_observations, dones = dataloader.compile_data()

15
[1960, 1870, 1770, 1705, 1700, 1685, 1665, 1660, 1660, 1605, 1605, 1580, 1525, 1490, 1470]


## MAKE ENVIRONMENT

In [4]:
def make_env(env_id, seed=25):
    env = gym.make(env_id, obs_type='grayscale', render_mode='rgb_array', repeat_action_probability=0.15,frameskip=1)
    env.seed(seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    return env
env = make_env("SpaceInvaders-v0", seed=seed)
print(env.action_space.n)
print(env.observation_space.shape)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device: ", device)

6
(210, 160)
Device:  cuda:0


In [5]:
from dqn import DQN
import dqn


def visualize(learner, env, video_name="test"):
    """Visualize a policy network for a given algorithm on a single episode

        Args:
            algorithm (PolicyGradient): Algorithm whose policy network will be rolled out for the episode. If
            no algorithm is passed in, a random policy will be visualized.
            video_name (str): Name for the mp4 file of the episode that will be saved (omit .mp4). Only used
            when running on local machine.
    """

    import cv2

    print("Visualizing")

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    video = cv2.VideoWriter(f"{video_name}.avi", fourcc, 24, (160,210), isColor = True)
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        if isinstance(learner, DQN):
            with torch.no_grad():
              action = learner.get_action(
              torch.tensor(obs).unsqueeze(0), eps=0.0
              )
        else:
            with torch.no_grad():
              action = learner.get_action(
              torch.tensor(obs).unsqueeze(0)
              )
        obs, reward, done, info = env.step(action)

        total_reward += reward

        if done:
            break

        im = env.render(mode='rgb_array')
        
        video.write(im)

    video.release()
    env.close()
    print(f"Video saved as {video_name}.avi")
    print("Reward: " + str(total_reward))

# TRAIN DQN (TEST)

In [6]:
from dqn import DQN
import dqn

INPUT_SHAPE = 210*160
ACTION_SIZE = env.action_space.n

dqn_learner = DQN(INPUT_SHAPE, ACTION_SIZE)

dqn.train(dqn_learner, env, observations=observations, actions=actions, rewards=rewards, next_observations=next_observations, dones=dones, save_path='models/dqn_test.pth', num_episodes=25, lr=1e-4)

  val_action = network.get_action(torch.Tensor([val_obs]).to(device), eps=0.00)
  4%|▍         | 1/25 [00:52<21:11, 52.98s/it]

New minimum:  1570.3909033714863


  8%|▊         | 2/25 [01:39<18:55, 49.38s/it]

New minimum:  679.7369761247455


 12%|█▏        | 3/25 [02:20<16:35, 45.24s/it]

New minimum:  524.0243489350752


 24%|██▍       | 6/25 [04:34<14:00, 44.25s/it]

New minimum:  515.9805828436259


 28%|██▊       | 7/25 [05:18<13:16, 44.25s/it]

New minimum:  435.5177795996826


In [None]:
visualize(dqn_learner, env, "dqn_learner")

Visualizing
Video saved as dqn_learner.avi
Reward: 205.0


## Train BC

In [None]:
from bc import SpaceInvLearner
import bc

learner = SpaceInvLearner(env)

bc.train(learner=learner, observations=observations, checkpoint_path="models/bc_learner.pth", actions=actions, num_epochs=100)

Training the learner


KeyboardInterrupt: 

In [None]:
learner.load_state_dict(torch.load("models/bc_learner.pth"), strict=True)
total_learner_reward = 0
done = False
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

for i in range(20):
    obs = env.reset()
    done = False
    while not done:
        with torch.no_grad():
            action = learner.get_action(torch.Tensor([obs]).to(device))
        obs, reward, done, info = env.step(action)
        total_learner_reward += reward
        if done:
            break

print(total_learner_reward/20)

visualize(learner, env, "bc_learner")

Visualizing


  action = learner.get_action(torch.Tensor([obs]).to(device))


Video saved as bc_learner.avi
Reward: 80.0


## LOAD EXPERT

In [None]:
from expert.ppo import PPOAgent, ActorCnn, CriticCnn

INPUT_SHAPE = (4, 84, 84)
ACTION_SIZE = env.action_space.n
SEED = 0
GAMMA = 0.99           # discount factor
ALPHA= 0.00001         # Actor learning rate
BETA = 0.00001          # Critic learning rate
TAU = 0.95
BATCH_SIZE = 64
PPO_EPOCH = 10
CLIP_PARAM = 0.2
UPDATE_EVERY = 1000    # how often to update the network 

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent = PPOAgent(INPUT_SHAPE, ACTION_SIZE, SEED, device, GAMMA, ALPHA, BETA, TAU, UPDATE_EVERY, BATCH_SIZE, PPO_EPOCH, CLIP_PARAM, ActorCnn(INPUT_SHAPE, ACTION_SIZE), CriticCnn(INPUT_SHAPE))
agent.load_model("models/expert_actor.pth", device)

## DAgger Implementation

In [None]:
import dagger

dagger.interact(env, learner, agent, observations=[], actions=[], checkpoint_path="models/DAgger.pth", seed=seed, num_epochs=40, tqdm_disable=True)

After interaction 0, reward = 80.0
Training the learner
Training for 40 epochs
Epoch 0, Loss: 0.2285896447943706
Epoch 1, Loss: 0.16880559284313051
Epoch 2, Loss: 0.15796418084817773
Epoch 3, Loss: 0.15377965417562747
Epoch 4, Loss: 0.1483202627476524
Epoch 5, Loss: 0.14525875747203826
Epoch 6, Loss: 0.14375673450675666
Epoch 7, Loss: 0.14360926916786268
Epoch 8, Loss: 0.14317754314226264
Epoch 9, Loss: 0.14113674181349137
Epoch 10, Loss: 0.13915853751640694
Epoch 11, Loss: 0.13925927380720773
Epoch 12, Loss: 0.13707467609760807
Epoch 13, Loss: 0.13588255976929384
Epoch 14, Loss: 0.13434492244439966
Epoch 15, Loss: 0.13368839709197775
Epoch 16, Loss: 0.1337788970447054
Epoch 17, Loss: 0.13292337802110935
Epoch 18, Loss: 0.1343384880061243
Epoch 19, Loss: 0.13505621949831645
Epoch 20, Loss: 0.13200805456030604
Epoch 21, Loss: 0.13258715593347362
Epoch 22, Loss: 0.1319811149555094
Epoch 23, Loss: 0.13122459562385783
Epoch 24, Loss: 0.12987088315627154
Epoch 25, Loss: 0.12836752119017583


KeyboardInterrupt: 

In [None]:
learner.load_state_dict(torch.load("models/DAgger.pth"), strict=True)
total_learner_reward = 0
done = False
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

for i in range(20):
    obs = env.reset()
    done = False
    while not done:
        with torch.no_grad():
            action = learner.get_action(torch.Tensor([obs]).to(device))
        obs, reward, done, info = env.step(action)
        total_learner_reward += reward

print(total_learner_reward/20)

visualize(learner, env, "dagger_learner.avi")

118.25
