In [1]:
from dataloader import AtariDataset
import gym
import torch.nn as nn
import torch
import numpy as np
import random
import tqdm
from tqdm import tqdm
import torch.nn.functional as F
from torch.optim import optimizer
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

## SEEDING

In [2]:
def reseed(seed):
    """Seed the torch, stdlib ``random`` and NumPy global RNGs with one value.

    Args:
        seed: Value handed unchanged to each library's seeding function.
    """
    # Same call order as before: torch, then ``random``, then NumPy.
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)
# Notebook-wide seed; also passed to make_env(...) and dagger.interact(...) further down.
seed = 42
reseed(seed)

## LOAD DATA

In [3]:
# Build the dataset object for "atari_v1" and unpack the five values returned by
# compile_data(); these are the inputs to the DQN and BC training calls below.
dataloader = AtariDataset("atari_v1")
observations, actions, rewards, next_observations, dones = dataloader.compile_data()

1
[1960]


## MAKE ENVIRONMENT

In [4]:
def make_env(env_id, seed=25):
    """Create a grayscale, frameskip-1 Gym environment and seed it.

    Args:
        env_id: Environment id forwarded to ``gym.make``.
        seed: Seed applied to the environment and to both its action and
            observation spaces.

    Returns:
        The seeded environment.
    """
    make_kwargs = {
        "obs_type": "grayscale",
        "render_mode": None,
        "repeat_action_probability": 0.15,
        "frameskip": 1,
    }
    env = gym.make(env_id, **make_kwargs)
    env.seed(seed)
    # The spaces are seeded separately from the environment, action space first.
    for space in (env.action_space, env.observation_space):
        space.seed(seed)
    return env
# Create the Space Invaders environment with the notebook-wide seed.
env = make_env("SpaceInvaders-v0", seed=seed)
# Print the number of discrete actions and the observation shape for reference.
print(env.action_space.n)
print(env.observation_space.shape)




6
(210, 160)


A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


## TRAIN DQN (TEST)

In [6]:

from dqn import DQN
import dqn

# Flattened observation size for the DQN input. Derived from the environment
# rather than hard-coded as 210*160 so it follows the actual observation shape.
# NOTE: INPUT_SHAPE and ACTION_SIZE are reassigned in the LOAD EXPERT cell below.
INPUT_SHAPE = int(np.prod(env.observation_space.shape))
ACTION_SIZE = env.action_space.n

dqn_learner = DQN(INPUT_SHAPE, ACTION_SIZE)

# Kept as the final expression so the notebook displays whatever dqn.train returns.
dqn.train(
    dqn_learner,
    env,
    observations=observations,
    actions=actions,
    rewards=rewards,
    next_observations=next_observations,
    dones=dones,
    save_path='models/dqn_test.pth',
)

  0%|          | 0/100 [00:09<?, ?it/s]


KeyboardInterrupt: 

## Train BC

In [None]:
from bc import SpaceInvLearner
import bc

# Behaviour-cloning learner built from the environment created above.
learner = SpaceInvLearner(env)

# Train on the loaded observation/action pairs and checkpoint to disk. The call
# stays the last expression of the cell so its return value is displayed.
bc.train(
    learner=learner,
    observations=observations,
    checkpoint_path="models/bc_learner.pth",
    actions=actions,
    num_epochs=40,
)

Training the learner
Training for 40 epochs


  2%|▎         | 1/40 [00:18<11:53, 18.30s/it]

Epoch 0, Loss: 0.2328628909548205


  5%|▌         | 2/40 [00:36<11:29, 18.14s/it]

Epoch 1, Loss: 0.09765370158450021


  8%|▊         | 3/40 [00:53<10:57, 17.77s/it]

Epoch 2, Loss: 0.08236520489168753


 10%|█         | 4/40 [01:10<10:21, 17.26s/it]

Epoch 3, Loss: 0.07443511830046087


 12%|█▎        | 5/40 [01:26<09:54, 16.98s/it]

Epoch 4, Loss: 0.0692666074950839


 15%|█▌        | 6/40 [01:43<09:31, 16.80s/it]

Epoch 5, Loss: 0.06509735768685687


 18%|█▊        | 7/40 [01:59<09:11, 16.71s/it]

Epoch 6, Loss: 0.06211118761202208


 20%|██        | 8/40 [02:16<08:52, 16.65s/it]

Epoch 7, Loss: 0.059370186522892825


 22%|██▎       | 9/40 [02:32<08:36, 16.66s/it]

Epoch 8, Loss: 0.056966968130095344


 25%|██▌       | 10/40 [02:49<08:22, 16.76s/it]

Epoch 9, Loss: 0.05515189998744088


 28%|██▊       | 11/40 [03:07<08:13, 17.00s/it]

Epoch 10, Loss: 0.05301044978300527


 30%|███       | 12/40 [03:25<08:06, 17.37s/it]

Epoch 11, Loss: 0.05154143571335071


 32%|███▎      | 13/40 [03:42<07:43, 17.18s/it]

Epoch 12, Loss: 0.050305204287308634


 35%|███▌      | 14/40 [04:00<07:36, 17.56s/it]

Epoch 13, Loss: 0.048640288734539745


 38%|███▊      | 15/40 [04:17<07:12, 17.30s/it]

Epoch 14, Loss: 0.047475668442041874


 40%|████      | 16/40 [04:34<06:50, 17.09s/it]

Epoch 15, Loss: 0.04637358335841032


 42%|████▎     | 17/40 [04:50<06:28, 16.89s/it]

Epoch 16, Loss: 0.04509320502866254


 45%|████▌     | 18/40 [05:07<06:11, 16.91s/it]

Epoch 17, Loss: 0.044029522883946966


 48%|████▊     | 19/40 [05:24<05:53, 16.85s/it]

Epoch 18, Loss: 0.04287427127895116


 50%|█████     | 20/40 [05:41<05:37, 16.89s/it]

Epoch 19, Loss: 0.04217651392170204


 52%|█████▎    | 21/40 [05:57<05:18, 16.77s/it]

Epoch 20, Loss: 0.04118504779411436


 55%|█████▌    | 22/40 [06:14<05:01, 16.74s/it]

Epoch 21, Loss: 0.04024586619456568


 57%|█████▊    | 23/40 [06:30<04:43, 16.65s/it]

Epoch 22, Loss: 0.03981901526374851


 60%|██████    | 24/40 [06:47<04:27, 16.73s/it]

Epoch 23, Loss: 0.038817869599454845


 62%|██████▎   | 25/40 [07:04<04:09, 16.66s/it]

Epoch 24, Loss: 0.0381938560814916


 65%|██████▌   | 26/40 [07:20<03:52, 16.58s/it]

Epoch 25, Loss: 0.037224467491755835


 68%|██████▊   | 27/40 [07:37<03:35, 16.58s/it]

Epoch 26, Loss: 0.03671300086403454


 70%|███████   | 28/40 [07:54<03:23, 16.98s/it]

Epoch 27, Loss: 0.03613608223755015


 72%|███████▎  | 29/40 [08:13<03:10, 17.34s/it]

Epoch 28, Loss: 0.03564826463161374


 75%|███████▌  | 30/40 [08:32<02:57, 17.79s/it]

Epoch 29, Loss: 0.03460118074141664


 78%|███████▊  | 31/40 [08:50<02:40, 17.89s/it]

Epoch 30, Loss: 0.03443758685694685


 80%|████████  | 32/40 [09:07<02:20, 17.61s/it]

Epoch 31, Loss: 0.03386444329452856


 82%|████████▎ | 33/40 [09:23<02:01, 17.39s/it]

Epoch 32, Loss: 0.03322192889198798


 85%|████████▌ | 34/40 [09:41<01:44, 17.39s/it]

Epoch 33, Loss: 0.03271194785238533


 88%|████████▊ | 35/40 [09:57<01:25, 17.11s/it]

Epoch 34, Loss: 0.03248046584419731


 90%|█████████ | 36/40 [10:14<01:07, 16.98s/it]

Epoch 35, Loss: 0.03174151636866


 92%|█████████▎| 37/40 [10:31<00:50, 16.94s/it]

Epoch 36, Loss: 0.03136699610121907


 95%|█████████▌| 38/40 [10:49<00:34, 17.18s/it]

Epoch 37, Loss: 0.030783186944443715


 98%|█████████▊| 39/40 [11:08<00:17, 17.76s/it]

Epoch 38, Loss: 0.030496925144869083


100%|██████████| 40/40 [11:26<00:00, 17.16s/it]

Epoch 39, Loss: 0.029993618590682898





SpaceInvLearner(
  (fc1): Linear(in_features=33600, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=256, bias=True)
  (fc_out): Linear(in_features=256, out_features=6, bias=True)
)

In [None]:
# Evaluate the behaviour-cloned learner: mean total reward per episode over a
# fixed number of episodes in `env`.
NUM_EVAL_EPISODES = 20
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
learner.load_state_dict(torch.load("models/bc_learner.pth", map_location=device), strict=True)
total_learner_reward = 0

for i in range(NUM_EVAL_EPISODES):
    obs = env.reset()
    done = False
    while not done:
        with torch.no_grad():
            # Convert the observation array directly and add a batch dimension.
            # torch.Tensor([obs]) builds the tensor from a Python list of
            # ndarrays, which is slow and emits a UserWarning.
            obs_batch = torch.as_tensor(np.asarray(obs), dtype=torch.float32).unsqueeze(0).to(device)
            action = learner.get_action(obs_batch)
        obs, reward, done, info = env.step(action)
        total_learner_reward += reward

print(total_learner_reward / NUM_EVAL_EPISODES)

  action = learner.get_action(torch.Tensor([obs]).to(device))


186.0


## LOAD EXPERT

In [None]:
from expert.ppo import PPOAgent, ActorCnn, CriticCnn

# Settings handed positionally to PPOAgent / ActorCnn / CriticCnn below.
INPUT_SHAPE = (4, 84, 84)
ACTION_SIZE = env.action_space.n
SEED = 0              # separate from the notebook-wide `seed`
GAMMA = 0.99          # discount factor
ALPHA = 1e-5          # actor learning rate
BETA = 1e-5           # critic learning rate
TAU = 0.95            # NOTE(review): presumably the GAE lambda -- confirm in expert.ppo
BATCH_SIZE = 64
PPO_EPOCH = 10
CLIP_PARAM = 0.2
UPDATE_EVERY = 1000   # how often to update the network

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Argument order is unchanged from the original single-line call.
agent = PPOAgent(
    INPUT_SHAPE,
    ACTION_SIZE,
    SEED,
    device,
    GAMMA,
    ALPHA,
    BETA,
    TAU,
    UPDATE_EVERY,
    BATCH_SIZE,
    PPO_EPOCH,
    CLIP_PARAM,
    ActorCnn(INPUT_SHAPE, ACTION_SIZE),
    CriticCnn(INPUT_SHAPE),
)
# Load the saved expert actor from disk.
agent.load_model("models/expert_actor.pth", device)

## DAgger Implementation

In [None]:
import dagger

# Run DAgger with the current learner and the loaded expert agent, starting from
# empty observation/action lists and checkpointing to models/DAgger.pth.
dagger.interact(
    env,
    learner,
    agent,
    observations=[],
    actions=[],
    checkpoint_path="models/DAgger.pth",
    seed=seed,
    num_epochs=40,
    tqdm_disable=True,
)

After interaction 0, reward = 160.0
Training the learner
Training for 40 epochs
Epoch 0, Loss: 0.09196897419218451
Epoch 1, Loss: 0.023473032517533327
Epoch 2, Loss: 0.013229532919080743
Epoch 3, Loss: 0.00898487860300569
Epoch 4, Loss: 0.006821882086250559
Epoch 5, Loss: 0.0057289537740347314
Epoch 6, Loss: 0.00484704977706276
Epoch 7, Loss: 0.00433736169568926
Epoch 8, Loss: 0.00392468279840345
Epoch 9, Loss: 0.0036411496291156448
Epoch 10, Loss: 0.003362798730235628
Epoch 11, Loss: 0.003148196141088112
Epoch 12, Loss: 0.0029302238041656586
Epoch 13, Loss: 0.0027873978977331253
Epoch 14, Loss: 0.002602497859374499
Epoch 15, Loss: 0.002486137566886792
Epoch 16, Loss: 0.00235297215092209
Epoch 17, Loss: 0.0022793096824341636
Epoch 18, Loss: 0.0022279966772979004
Epoch 19, Loss: 0.002104322397026435
Epoch 20, Loss: 0.00195839288265466
Epoch 21, Loss: 0.0019096977975036438
Epoch 22, Loss: 0.0018380036446931705
Epoch 23, Loss: 0.00181110089767755
Epoch 24, Loss: 0.0017506486854312658
Epoc

: 

In [None]:
# Evaluate the DAgger-trained learner: mean total reward per episode over a
# fixed number of episodes in `env`.
NUM_EVAL_EPISODES = 20
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
learner.load_state_dict(torch.load("models/DAgger.pth", map_location=device), strict=True)
total_learner_reward = 0

for i in range(NUM_EVAL_EPISODES):
    obs = env.reset()
    done = False
    while not done:
        with torch.no_grad():
            # Convert the observation array directly and add a batch dimension.
            # torch.Tensor([obs]) builds the tensor from a Python list of
            # ndarrays, which is slow and emits a UserWarning.
            obs_batch = torch.as_tensor(np.asarray(obs), dtype=torch.float32).unsqueeze(0).to(device)
            action = learner.get_action(obs_batch)
        obs, reward, done, info = env.step(action)
        total_learner_reward += reward

print(total_learner_reward / NUM_EVAL_EPISODES)

118.25
