In [1]:
from dataloader import AtariDataset
import gym
import torch.nn as nn
import torch
import numpy as np
import random
import tqdm
from tqdm import tqdm
import torch.nn.functional as F
from torch.optim import optimizer
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

## SEEDING

In [2]:
def reseed(seed):
  torch.manual_seed(seed)
  random.seed(seed)
  np.random.seed(seed)
seed = 42
reseed(seed)

## LOAD DATA

In [3]:
dataloader = AtariDataset("atari_v1")
observations, actions, rewards, next_observations, dones = dataloader.compile_data()

5
[1960, 1870, 1770, 1705, 1700]


ValueError: too many values to unpack (expected 2)

## MAKE ENVIRONMENT

In [None]:
def make_env(env_id, seed=25):
    env = gym.make(env_id, obs_type='grayscale', render_mode=None, repeat_action_probability=0.15,frameskip=1)
    env.seed(seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    return env
env = make_env("SpaceInvaders-v0", seed=seed)
print(env.action_space.n)
print(env.observation_space.shape)




6
(210, 160)


A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


# TRAIN DQN (TEST)

In [None]:

from dqn import DQN
import dqn

INPUT_SHAPE = (210, 160)
ACTION_SIZE = env.action_space.n

dqn_learner = DQN(INPUT_SHAPE, ACTION_SIZE)

dqn.train(dqn_learner, env, observations=observations, actions=actions, rewards=rewards, next_observations=next_observations, dones=dones, save_path='models/dqn_test.pth')

## Train BC

In [None]:
from bc import SpaceInvLearner
import bc

learner = SpaceInvLearner(env)

bc.train(learner=learner, observations=observations, checkpoint_path="models/bc_learner.pth", actions=actions, num_epochs=40)

Training the learner
Training for 40 epochs


  2%|▎         | 1/40 [00:19<12:49, 19.73s/it]

Epoch 0, Loss: 0.2328628909548205


  5%|▌         | 2/40 [00:39<12:27, 19.67s/it]

Epoch 1, Loss: 0.09765370158450021


  8%|▊         | 3/40 [00:57<11:36, 18.81s/it]

Epoch 2, Loss: 0.08236520489168753


 10%|█         | 4/40 [01:14<11:01, 18.39s/it]

Epoch 3, Loss: 0.07443511830046087


 12%|█▎        | 5/40 [01:31<10:23, 17.81s/it]

Epoch 4, Loss: 0.0692666074950839


 15%|█▌        | 6/40 [01:48<09:50, 17.38s/it]

Epoch 5, Loss: 0.06509735768685687


 18%|█▊        | 7/40 [02:04<09:24, 17.11s/it]

Epoch 6, Loss: 0.06211118761202208


 20%|██        | 8/40 [02:21<08:58, 16.84s/it]

Epoch 7, Loss: 0.059370186522892825


 22%|██▎       | 9/40 [02:37<08:40, 16.80s/it]

Epoch 8, Loss: 0.056966968130095344


 25%|██▌       | 10/40 [02:55<08:30, 17.02s/it]

Epoch 9, Loss: 0.05515189998744088


 28%|██▊       | 11/40 [03:12<08:13, 17.03s/it]

Epoch 10, Loss: 0.05301044978300527


 30%|███       | 12/40 [03:30<08:10, 17.53s/it]

Epoch 11, Loss: 0.05154143571335071


 32%|███▎      | 13/40 [03:49<07:59, 17.75s/it]

Epoch 12, Loss: 0.050305204287308634


 35%|███▌      | 14/40 [04:06<07:36, 17.54s/it]

Epoch 13, Loss: 0.048640288734539745


 38%|███▊      | 15/40 [04:23<07:13, 17.34s/it]

Epoch 14, Loss: 0.047475668442041874


 40%|████      | 16/40 [04:39<06:50, 17.11s/it]

Epoch 15, Loss: 0.04637358335841032


 42%|████▎     | 17/40 [04:56<06:29, 16.93s/it]

Epoch 16, Loss: 0.04509320502866254


 45%|████▌     | 18/40 [05:12<06:08, 16.77s/it]

Epoch 17, Loss: 0.044029522883946966


 48%|████▊     | 19/40 [05:29<05:49, 16.65s/it]

Epoch 18, Loss: 0.04287427127895116


 50%|█████     | 20/40 [05:45<05:31, 16.59s/it]

Epoch 19, Loss: 0.04217651392170204


 52%|█████▎    | 21/40 [06:01<05:14, 16.55s/it]

Epoch 20, Loss: 0.04118504779411436


 55%|█████▌    | 22/40 [06:18<04:57, 16.51s/it]

Epoch 21, Loss: 0.04024586619456568


 57%|█████▊    | 23/40 [06:34<04:40, 16.53s/it]

Epoch 22, Loss: 0.03981901526374851


 60%|██████    | 24/40 [06:51<04:24, 16.51s/it]

Epoch 23, Loss: 0.038817869599454845


 62%|██████▎   | 25/40 [07:08<04:10, 16.71s/it]

Epoch 24, Loss: 0.0381938560814916


 65%|██████▌   | 26/40 [07:25<03:56, 16.86s/it]

Epoch 25, Loss: 0.037224467491755835


 68%|██████▊   | 27/40 [07:42<03:39, 16.85s/it]

Epoch 26, Loss: 0.03671300086403454


 70%|███████   | 28/40 [07:59<03:23, 16.94s/it]

Epoch 27, Loss: 0.03613608223755015


 72%|███████▎  | 29/40 [08:17<03:07, 17.05s/it]

Epoch 28, Loss: 0.03564826463161374


 75%|███████▌  | 30/40 [08:34<02:51, 17.12s/it]

Epoch 29, Loss: 0.03460118074141664


 78%|███████▊  | 31/40 [08:51<02:34, 17.14s/it]

Epoch 30, Loss: 0.03443758685694685


 80%|████████  | 32/40 [09:08<02:16, 17.09s/it]

Epoch 31, Loss: 0.03386444329452856


 82%|████████▎ | 33/40 [09:26<02:01, 17.40s/it]

Epoch 32, Loss: 0.03322192889198798


 85%|████████▌ | 34/40 [09:43<01:44, 17.34s/it]

Epoch 33, Loss: 0.03271194785238533


 88%|████████▊ | 35/40 [10:01<01:26, 17.35s/it]

Epoch 34, Loss: 0.03248046584419731


 90%|█████████ | 36/40 [10:18<01:09, 17.44s/it]

Epoch 35, Loss: 0.03174151636866


 92%|█████████▎| 37/40 [10:35<00:51, 17.33s/it]

Epoch 36, Loss: 0.03136699610121907


 95%|█████████▌| 38/40 [10:53<00:34, 17.36s/it]

Epoch 37, Loss: 0.030783186944443715


 98%|█████████▊| 39/40 [11:10<00:17, 17.33s/it]

Epoch 38, Loss: 0.030496925144869083


100%|██████████| 40/40 [11:27<00:00, 17.20s/it]

Epoch 39, Loss: 0.029993618590682898





SpaceInvLearner(
  (fc1): Linear(in_features=33600, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=256, bias=True)
  (fc_out): Linear(in_features=256, out_features=6, bias=True)
)

In [None]:
learner.load_state_dict(torch.load("models/bc_learner.pth"), strict=True)
total_learner_reward = 0
done = False
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

for i in range(20):
    obs = env.reset()
    done = False
    while not done:
        with torch.no_grad():
            action = learner.get_action(torch.Tensor([obs]).to(device))
        obs, reward, done, info = env.step(action)
        total_learner_reward += reward
        if done:
            break

print(total_learner_reward/20)

127.25


## LOAD EXPERT

In [None]:
from expert.ppo import PPOAgent, ActorCnn, CriticCnn

INPUT_SHAPE = (4, 84, 84)
ACTION_SIZE = env.action_space.n
SEED = 0
GAMMA = 0.99           # discount factor
ALPHA= 0.0001          # Actor learning rate
BETA = 0.0001          # Critic learning rate
TAU = 0.95
BATCH_SIZE = 32
PPO_EPOCH = 5
CLIP_PARAM = 0.2
UPDATE_EVERY = 1000     # how often to update the network 

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent = PPOAgent(INPUT_SHAPE, ACTION_SIZE, SEED, device, GAMMA, ALPHA, BETA, TAU, UPDATE_EVERY, BATCH_SIZE, PPO_EPOCH, CLIP_PARAM, ActorCnn(INPUT_SHAPE, ACTION_SIZE), CriticCnn(INPUT_SHAPE))
agent.load_model("models/expert_actor.pth", device)

## DAgger Implementation

In [None]:
import dagger

dagger.interact(env, learner, agent, observations=observations, actions=actions, checkpoint_path="models/DAgger.pth", seed=seed, num_epochs=40, tqdm_disable=True)

After interaction 0, reward = 50.0
Training the learner
Training for 20 epochs
Epoch 0, Loss: 0.2291255757028674
Epoch 1, Loss: 0.17323820734465564
Epoch 2, Loss: 0.1555330808515902
Epoch 3, Loss: 0.14674009106777333
Epoch 4, Loss: 0.14217397589006542
Epoch 5, Loss: 0.13803996384879688
Epoch 6, Loss: 0.1353701772513213
Epoch 7, Loss: 0.1326952221584909
Epoch 8, Loss: 0.1309829733253997
Epoch 9, Loss: 0.12841376533478865
Epoch 10, Loss: 0.1281951579414768
Epoch 11, Loss: 0.12567432427111966
Epoch 12, Loss: 0.12410514976507352
Epoch 13, Loss: 0.12316190092651932
Epoch 14, Loss: 0.1219550437397427
Epoch 15, Loss: 0.12118014113770591
Epoch 16, Loss: 0.1200339741728924
Epoch 17, Loss: 0.11922966596888906
Epoch 18, Loss: 0.11753706652441143
Epoch 19, Loss: 0.11707328039186972
After interaction 1, reward = 135.0
Training the learner
Training for 20 epochs
Epoch 0, Loss: 0.1677919045420297
Epoch 1, Loss: 0.1543796840376021
Epoch 2, Loss: 0.1446054458709094
Epoch 3, Loss: 0.14156718197485402
Ep

In [None]:
learner.load_state_dict(torch.load("models/DAgger.pth"), strict=True)
total_learner_reward = 0
done = False
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

for i in range(20):
    obs = env.reset()
    done = False
    while not done:
        with torch.no_grad():
            action = learner.get_action(torch.Tensor([obs]).to(device))
        obs, reward, done, info = env.step(action)
        total_learner_reward += reward

print(total_learner_reward/20)

118.25


In [None]:
from dqn import DQN
import dqn

INPUT_SHAPE = (210, 160)
ACTION_SIZE = env.action_space.n

dqn_learner = DQN(INPUT_SHAPE, ACTION_SIZE)

dqn.train(dqn_learner, env, )

TypeError: train() missing 6 required positional arguments: 'observations', 'actions', 'rewards', 'next_observations', 'dones', and 'save_path'