In [1]:
from dataloader import AtariDataset
import gym
import torch.nn as nn
import torch
import numpy as np
import random
import tqdm
from tqdm import tqdm
import torch.nn.functional as F
from torch.optim import optimizer
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

## SEEDING

In [2]:
def reseed(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Value handed, in order, to ``torch.manual_seed``,
            ``random.seed`` and ``np.random.seed``.
    """
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)


# Notebook-wide seed; later cells pass it on when building the environment
# and when running DAgger.
seed = 42
reseed(seed)

## LOAD DATA

In [3]:
# Open the "atari_v1" dataset and compile it into an (observations, actions)
# pair; both are passed to bc.train() further down as the training data.
# NOTE(review): presumably these are expert demonstrations — confirm in dataloader.py.
dataloader = AtariDataset("atari_v1")
observations, actions = dataloader.compile_data()

1


## MAKE ENVIRONMENT

In [4]:
def make_env(env_id, seed=25):
    """Create a grayscale, non-rendering Gym environment and seed it.

    Args:
        env_id: Gym environment id forwarded to ``gym.make``.
        seed: Seed applied to the environment itself and to its action and
            observation spaces.

    Returns:
        The seeded environment.
    """
    env = gym.make(env_id, obs_type='grayscale', render_mode=None)
    # Seed the env first, then both spaces, so sampling from them is repeatable.
    for seedable in (env, env.action_space, env.observation_space):
        seedable.seed(seed)
    return env
# Build the Space Invaders environment with the notebook-wide seed, then show
# the number of available actions and the shape of a single observation.
env = make_env("SpaceInvaders-v0", seed=seed)
print(env.action_space.n)
print(env.observation_space.shape)




6
(210, 160)


A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


## TRAIN BC (BEHAVIORAL CLONING)

In [5]:
from bc import SpaceInvLearner
import bc

# Learner policy constructed from the environment.
learner = SpaceInvLearner(env)

# Train the learner on the loaded (observations, actions) data for 10 epochs,
# passing "models/bc_learner.pth" as the checkpoint path.
bc.train(learner=learner, observations=observations, checkpoint_path="models/bc_learner.pth", actions=actions, num_epochs=10)

Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:04,  1.92it/s]

Epoch 0, Loss: 0.320985439909767


 20%|██        | 2/10 [00:00<00:03,  2.19it/s]

Epoch 1, Loss: 0.15203793015841832


 30%|███       | 3/10 [00:01<00:03,  2.28it/s]

Epoch 2, Loss: 0.11707441591602658


 40%|████      | 4/10 [00:01<00:02,  2.33it/s]

Epoch 3, Loss: 0.0897348363447287


 50%|█████     | 5/10 [00:02<00:02,  2.36it/s]

Epoch 4, Loss: 0.07815708242093747


 60%|██████    | 6/10 [00:02<00:01,  2.37it/s]

Epoch 5, Loss: 0.06733066741372029


 70%|███████   | 7/10 [00:03<00:01,  2.38it/s]

Epoch 6, Loss: 0.06035556291653814


 80%|████████  | 8/10 [00:03<00:00,  2.38it/s]

Epoch 7, Loss: 0.05644254338033301


 90%|█████████ | 9/10 [00:03<00:00,  2.39it/s]

Epoch 8, Loss: 0.05239081545728374


100%|██████████| 10/10 [00:04<00:00,  2.35it/s]

Epoch 9, Loss: 0.05056144293562604





SpaceInvLearner(
  (fc1): Linear(in_features=33600, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=256, bias=True)
  (fc_out): Linear(in_features=256, out_features=6, bias=True)
)

In [6]:
# Roll the learner out for one full episode and report its total reward.
total_learner_reward = 0
done = False
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
obs = env.reset()
while not done:
    # Convert the frame through a single ndarray instead of a Python list of
    # ndarrays: torch.Tensor([obs]) is the slow path PyTorch warns about on
    # every step. The result is the same float32 batch of size one.
    obs_batch = torch.as_tensor(np.asarray(obs), dtype=torch.float32, device=device).unsqueeze(0)
    with torch.no_grad():  # inference only; no autograd graph needed
        action = learner.get_action(obs_batch)
    # The loop condition re-checks `done`, so no explicit break is required.
    obs, reward, done, info = env.step(action)
    total_learner_reward += reward

print(total_learner_reward)

  action = learner.get_action(torch.Tensor([obs]).to(device))


410.0


## LOAD EXPERT

In [7]:
from expert.ppo import PPOAgent, ActorCnn, CriticCnn

# Hyperparameters handed to the PPO expert.
# NOTE(review): INPUT_SHAPE looks like 4 stacked 84x84 frames — confirm against expert/ppo.py.
INPUT_SHAPE = (4, 84, 84)
ACTION_SIZE = env.action_space.n
SEED = 0
GAMMA = 0.99            # discount factor
ALPHA = 0.0001          # Actor learning rate
BETA = 0.0001           # Critic learning rate
TAU = 0.95
BATCH_SIZE = 32
PPO_EPOCH = 5
CLIP_PARAM = 0.2
UPDATE_EVERY = 1000     # how often to update the network

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Arguments are positional and kept in their original order.
agent = PPOAgent(
    INPUT_SHAPE,
    ACTION_SIZE,
    SEED,
    device,
    GAMMA,
    ALPHA,
    BETA,
    TAU,
    UPDATE_EVERY,
    BATCH_SIZE,
    PPO_EPOCH,
    CLIP_PARAM,
    ActorCnn(INPUT_SHAPE, ACTION_SIZE),
    CriticCnn(INPUT_SHAPE),
)
# Load the saved expert actor from disk.
agent.load_model("models/expert_actor.pth", device)

## DAgger Implementation

In [8]:
import dagger

# Run dagger.interact with the learner and the expert agent defined above,
# starting from empty observation/action lists, reusing the notebook-wide seed
# and passing "models/DAgger.pth" as the checkpoint path.
# Per the captured output, each interaction is followed by 10 training epochs.
dagger.interact(env, learner, agent, observations=[], actions=[], checkpoint_path="models/DAgger.pth", seed=seed, num_epochs=10)

After interaction 0, reward = 410.0
Training the learner
Training for 10 epochs


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0, Loss: 0.2948277564777406
Epoch 1, Loss: 0.20656341484703294


 30%|███       | 3/10 [00:00<00:00, 23.68it/s]

Epoch 2, Loss: 0.17200607228675652
Epoch 3, Loss: 0.15425679893107028
Epoch 4, Loss: 0.1446160303269999


 60%|██████    | 6/10 [00:00<00:00, 24.53it/s]

Epoch 5, Loss: 0.13549085769980465
Epoch 6, Loss: 0.1286286664913697
Epoch 7, Loss: 0.12418430659478519


100%|██████████| 10/10 [00:00<00:00, 24.65it/s]

Epoch 8, Loss: 0.12192010394686721
Epoch 9, Loss: 0.11810685396814048





After interaction 1, reward = 35.0
Training the learner
Training for 10 epochs


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0, Loss: 0.19481138471044912


 20%|██        | 2/10 [00:00<00:00, 15.55it/s]

Epoch 1, Loss: 0.1538970323998002
Epoch 2, Loss: 0.14032011478642953


 40%|████      | 4/10 [00:00<00:00, 12.29it/s]

Epoch 3, Loss: 0.13113687922563813


 60%|██████    | 6/10 [00:00<00:00, 13.90it/s]

Epoch 4, Loss: 0.1259158344704814
Epoch 5, Loss: 0.1225977922323781
Epoch 6, Loss: 0.11876644624482784


 80%|████████  | 8/10 [00:00<00:00, 14.73it/s]

Epoch 7, Loss: 0.1159073095284725


100%|██████████| 10/10 [00:00<00:00, 14.63it/s]

Epoch 8, Loss: 0.11406718360770153
Epoch 9, Loss: 0.1130205030930908





After interaction 2, reward = 20.0
Training the learner
Training for 10 epochs


 20%|██        | 2/10 [00:00<00:00, 12.83it/s]

Epoch 0, Loss: 0.1563744399148518
Epoch 1, Loss: 0.13475660156847827
Epoch 2, Loss: 0.12697476710973263


 60%|██████    | 6/10 [00:00<00:00, 12.67it/s]

Epoch 3, Loss: 0.12377025190192753
Epoch 4, Loss: 0.11976400173422116
Epoch 5, Loss: 0.11804099285653405


 80%|████████  | 8/10 [00:00<00:00, 12.68it/s]

Epoch 6, Loss: 0.11750074927168869
Epoch 7, Loss: 0.11517247386070202
Epoch 8, Loss: 0.11462897022586388


100%|██████████| 10/10 [00:00<00:00, 12.69it/s]


Epoch 9, Loss: 0.11396363987597322
After interaction 3, reward = 5.0
Training the learner
Training for 10 epochs


 30%|███       | 3/10 [00:00<00:00, 10.01it/s]

Epoch 0, Loss: 0.15595764099235884
Epoch 1, Loss: 0.13004231589478982
Epoch 2, Loss: 0.12350061741882143


 60%|██████    | 6/10 [00:00<00:00, 10.04it/s]

Epoch 3, Loss: 0.11869333573352889
Epoch 4, Loss: 0.11735922557731213
Epoch 5, Loss: 0.11487293815622723


 80%|████████  | 8/10 [00:00<00:00, 10.04it/s]

Epoch 6, Loss: 0.11507710832784396
Epoch 7, Loss: 0.11470771209491586
Epoch 8, Loss: 0.11340724568859034


100%|██████████| 10/10 [00:01<00:00,  9.99it/s]

Epoch 9, Loss: 0.11456415332118355





After interaction 4, reward = 15.0
Training the learner
Training for 10 epochs


 20%|██        | 2/10 [00:00<00:00,  8.92it/s]

Epoch 0, Loss: 0.14705232270716936
Epoch 1, Loss: 0.12282856185140258


 40%|████      | 4/10 [00:00<00:00,  8.92it/s]

Epoch 2, Loss: 0.11819980233095007
Epoch 3, Loss: 0.11583436746985672


 60%|██████    | 6/10 [00:00<00:00,  8.92it/s]

Epoch 4, Loss: 0.11585579888182042
Epoch 5, Loss: 0.11378046660909612


 80%|████████  | 8/10 [00:00<00:00,  8.87it/s]

Epoch 6, Loss: 0.11397900099250892
Epoch 7, Loss: 0.11195695546736716


100%|██████████| 10/10 [00:01<00:00,  8.87it/s]

Epoch 8, Loss: 0.11142374152484058
Epoch 9, Loss: 0.11037053228771981





After interaction 5, reward = 10.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:01,  7.16it/s]

Epoch 0, Loss: 0.1457210019004804


 20%|██        | 2/10 [00:00<00:01,  7.12it/s]

Epoch 1, Loss: 0.12570072973935997


 30%|███       | 3/10 [00:00<00:00,  7.06it/s]

Epoch 2, Loss: 0.11957560857820956


 40%|████      | 4/10 [00:00<00:00,  7.07it/s]

Epoch 3, Loss: 0.1162518713636326


 50%|█████     | 5/10 [00:00<00:00,  7.05it/s]

Epoch 4, Loss: 0.11522105856078091


 60%|██████    | 6/10 [00:00<00:00,  7.01it/s]

Epoch 5, Loss: 0.11447328192593753


 70%|███████   | 7/10 [00:00<00:00,  6.97it/s]

Epoch 6, Loss: 0.11477789159346907


 80%|████████  | 8/10 [00:01<00:00,  6.98it/s]

Epoch 7, Loss: 0.1143808558565872


 90%|█████████ | 9/10 [00:01<00:00,  6.99it/s]

Epoch 8, Loss: 0.11306933234494654


100%|██████████| 10/10 [00:01<00:00,  7.02it/s]

Epoch 9, Loss: 0.11366108051189325





After interaction 6, reward = 0.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:01,  6.30it/s]

Epoch 0, Loss: 0.13223038783058916


 20%|██        | 2/10 [00:00<00:01,  6.25it/s]

Epoch 1, Loss: 0.11984148449273913


 30%|███       | 3/10 [00:00<00:01,  6.21it/s]

Epoch 2, Loss: 0.11697846732686064


 40%|████      | 4/10 [00:00<00:00,  6.18it/s]

Epoch 3, Loss: 0.11548480228284794


 50%|█████     | 5/10 [00:00<00:00,  6.17it/s]

Epoch 4, Loss: 0.11588313382186947


 60%|██████    | 6/10 [00:00<00:00,  6.13it/s]

Epoch 5, Loss: 0.11389192148408347


 70%|███████   | 7/10 [00:01<00:00,  5.96it/s]

Epoch 6, Loss: 0.11532138088532926


 80%|████████  | 8/10 [00:01<00:00,  6.03it/s]

Epoch 7, Loss: 0.1143416247309826


 90%|█████████ | 9/10 [00:01<00:00,  6.06it/s]

Epoch 8, Loss: 0.11290893868427973


100%|██████████| 10/10 [00:01<00:00,  6.11it/s]

Epoch 9, Loss: 0.11329395243894255





After interaction 7, reward = 75.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:01,  5.33it/s]

Epoch 0, Loss: 0.1296674990715282


 20%|██        | 2/10 [00:00<00:01,  5.26it/s]

Epoch 1, Loss: 0.12102313989002478


 30%|███       | 3/10 [00:00<00:01,  5.25it/s]

Epoch 2, Loss: 0.11971118546952114


 40%|████      | 4/10 [00:00<00:01,  5.25it/s]

Epoch 3, Loss: 0.11575364301678659


 50%|█████     | 5/10 [00:00<00:00,  5.21it/s]

Epoch 4, Loss: 0.11691281799587717


 60%|██████    | 6/10 [00:01<00:00,  5.20it/s]

Epoch 5, Loss: 0.11544364360707789


 70%|███████   | 7/10 [00:01<00:00,  5.22it/s]

Epoch 6, Loss: 0.11599167387201527


 90%|█████████ | 9/10 [00:01<00:00,  5.13it/s]

Epoch 7, Loss: 0.11474122748659939
Epoch 8, Loss: 0.11382556544503496


100%|██████████| 10/10 [00:01<00:00,  5.18it/s]


Epoch 9, Loss: 0.11370569258136635
After interaction 8, reward = 140.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:02,  4.34it/s]

Epoch 0, Loss: 0.13312938502208074


 20%|██        | 2/10 [00:00<00:01,  4.30it/s]

Epoch 1, Loss: 0.12094090911244113


 30%|███       | 3/10 [00:00<00:01,  4.29it/s]

Epoch 2, Loss: 0.11841678721953905


 40%|████      | 4/10 [00:00<00:01,  4.27it/s]

Epoch 3, Loss: 0.11522584958854544


 50%|█████     | 5/10 [00:01<00:01,  4.26it/s]

Epoch 4, Loss: 0.11570525572359937


 60%|██████    | 6/10 [00:01<00:00,  4.26it/s]

Epoch 5, Loss: 0.114921571354946


 70%|███████   | 7/10 [00:01<00:00,  4.28it/s]

Epoch 6, Loss: 0.11547325955538547


 80%|████████  | 8/10 [00:01<00:00,  4.28it/s]

Epoch 7, Loss: 0.11337948081836594


 90%|█████████ | 9/10 [00:02<00:00,  4.28it/s]

Epoch 8, Loss: 0.11426795659304331


100%|██████████| 10/10 [00:02<00:00,  4.27it/s]

Epoch 9, Loss: 0.11379741790888612





After interaction 9, reward = 40.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:02,  4.18it/s]

Epoch 0, Loss: 0.12350445818271999


 20%|██        | 2/10 [00:00<00:01,  4.11it/s]

Epoch 1, Loss: 0.11421613454066586


 30%|███       | 3/10 [00:00<00:01,  4.08it/s]

Epoch 2, Loss: 0.11209200151931337


 40%|████      | 4/10 [00:00<00:01,  4.06it/s]

Epoch 3, Loss: 0.11202685080983919


 50%|█████     | 5/10 [00:01<00:01,  4.07it/s]

Epoch 4, Loss: 0.11249435509142036


 60%|██████    | 6/10 [00:01<00:00,  4.08it/s]

Epoch 5, Loss: 0.11058600147760567


 70%|███████   | 7/10 [00:01<00:00,  4.07it/s]

Epoch 6, Loss: 0.10970523199100098


 80%|████████  | 8/10 [00:01<00:00,  4.06it/s]

Epoch 7, Loss: 0.10867280701327348


 90%|█████████ | 9/10 [00:02<00:00,  4.05it/s]

Epoch 8, Loss: 0.10871212254514932


100%|██████████| 10/10 [00:02<00:00,  3.97it/s]

Epoch 9, Loss: 0.10891932526237649





After interaction 10, reward = 30.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:02,  3.67it/s]

Epoch 0, Loss: 0.12166472376986621


 20%|██        | 2/10 [00:00<00:02,  3.65it/s]

Epoch 1, Loss: 0.11190725406117193


 30%|███       | 3/10 [00:00<00:01,  3.64it/s]

Epoch 2, Loss: 0.11093905222376967


 40%|████      | 4/10 [00:01<00:01,  3.61it/s]

Epoch 3, Loss: 0.11030289929623556


 50%|█████     | 5/10 [00:01<00:01,  3.63it/s]

Epoch 4, Loss: 0.11150985495787749


 60%|██████    | 6/10 [00:01<00:01,  3.63it/s]

Epoch 5, Loss: 0.10937028113958458


 70%|███████   | 7/10 [00:01<00:00,  3.63it/s]

Epoch 6, Loss: 0.11115841904408295


 80%|████████  | 8/10 [00:02<00:00,  3.62it/s]

Epoch 7, Loss: 0.10961369245766318


 90%|█████████ | 9/10 [00:02<00:00,  3.62it/s]

Epoch 8, Loss: 0.10918076456354547


100%|██████████| 10/10 [00:02<00:00,  3.62it/s]

Epoch 9, Loss: 0.10837615582377019





After interaction 11, reward = 75.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:02,  3.40it/s]

Epoch 0, Loss: 0.1199425230277618


 20%|██        | 2/10 [00:00<00:02,  3.37it/s]

Epoch 1, Loss: 0.11033309092455128


 30%|███       | 3/10 [00:00<00:02,  3.36it/s]

Epoch 2, Loss: 0.10968377088779578


 40%|████      | 4/10 [00:01<00:01,  3.34it/s]

Epoch 3, Loss: 0.10942225902268102


 50%|█████     | 5/10 [00:01<00:01,  3.35it/s]

Epoch 4, Loss: 0.10935651886345836


 60%|██████    | 6/10 [00:01<00:01,  3.35it/s]

Epoch 5, Loss: 0.10870184509282776


 70%|███████   | 7/10 [00:02<00:00,  3.34it/s]

Epoch 6, Loss: 0.10775365567647721


 80%|████████  | 8/10 [00:02<00:00,  3.34it/s]

Epoch 7, Loss: 0.1085638064392352


 90%|█████████ | 9/10 [00:02<00:00,  3.33it/s]

Epoch 8, Loss: 0.10812687047315091


100%|██████████| 10/10 [00:02<00:00,  3.34it/s]

Epoch 9, Loss: 0.10842248536540891





After interaction 12, reward = 50.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:02,  3.10it/s]

Epoch 0, Loss: 0.11900170431669248


 20%|██        | 2/10 [00:00<00:02,  3.07it/s]

Epoch 1, Loss: 0.11060568559896512


 30%|███       | 3/10 [00:00<00:02,  3.05it/s]

Epoch 2, Loss: 0.11073204226638012


 40%|████      | 4/10 [00:01<00:01,  3.04it/s]

Epoch 3, Loss: 0.11018494009649273


 50%|█████     | 5/10 [00:01<00:01,  3.04it/s]

Epoch 4, Loss: 0.10822944367998587


 60%|██████    | 6/10 [00:01<00:01,  3.02it/s]

Epoch 5, Loss: 0.10797080017384526


 70%|███████   | 7/10 [00:02<00:00,  3.02it/s]

Epoch 6, Loss: 0.10768880081182619


 80%|████████  | 8/10 [00:02<00:00,  3.03it/s]

Epoch 7, Loss: 0.1080722351033779


 90%|█████████ | 9/10 [00:02<00:00,  3.03it/s]

Epoch 8, Loss: 0.10692768694142442


100%|██████████| 10/10 [00:03<00:00,  3.03it/s]

Epoch 9, Loss: 0.10632854389464491





After interaction 13, reward = 10.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:03,  2.91it/s]

Epoch 0, Loss: 0.11516270026121714


 20%|██        | 2/10 [00:00<00:02,  2.89it/s]

Epoch 1, Loss: 0.10841624745469224


 30%|███       | 3/10 [00:01<00:02,  2.85it/s]

Epoch 2, Loss: 0.10791664914241754


 40%|████      | 4/10 [00:01<00:02,  2.86it/s]

Epoch 3, Loss: 0.10668327201836957


 50%|█████     | 5/10 [00:01<00:01,  2.85it/s]

Epoch 4, Loss: 0.1068890533248072


 60%|██████    | 6/10 [00:02<00:01,  2.85it/s]

Epoch 5, Loss: 0.10741171446905018


 70%|███████   | 7/10 [00:02<00:01,  2.84it/s]

Epoch 6, Loss: 0.10619710945857093


 80%|████████  | 8/10 [00:02<00:00,  2.83it/s]

Epoch 7, Loss: 0.10620245840701235


 90%|█████████ | 9/10 [00:03<00:00,  2.84it/s]

Epoch 8, Loss: 0.10684376158536861


100%|██████████| 10/10 [00:03<00:00,  2.85it/s]

Epoch 9, Loss: 0.10584820858940654





After interaction 14, reward = 105.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:03,  2.70it/s]

Epoch 0, Loss: 0.11473098298938664


 20%|██        | 2/10 [00:00<00:03,  2.48it/s]

Epoch 1, Loss: 0.10841992510093967


 30%|███       | 3/10 [00:01<00:02,  2.54it/s]

Epoch 2, Loss: 0.10867164704797347


 40%|████      | 4/10 [00:01<00:02,  2.57it/s]

Epoch 3, Loss: 0.10802260978604142


 50%|█████     | 5/10 [00:01<00:01,  2.60it/s]

Epoch 4, Loss: 0.10749561572437412


 60%|██████    | 6/10 [00:02<00:01,  2.62it/s]

Epoch 5, Loss: 0.10719509167861303


 70%|███████   | 7/10 [00:02<00:01,  2.62it/s]

Epoch 6, Loss: 0.10712214938010448


 80%|████████  | 8/10 [00:03<00:00,  2.63it/s]

Epoch 7, Loss: 0.10638310448978118


 90%|█████████ | 9/10 [00:03<00:00,  2.64it/s]

Epoch 8, Loss: 0.1059758654130471


100%|██████████| 10/10 [00:03<00:00,  2.61it/s]

Epoch 9, Loss: 0.10588019319293372





After interaction 15, reward = 230.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:03,  2.37it/s]

Epoch 0, Loss: 0.11894153044513511


 20%|██        | 2/10 [00:00<00:03,  2.36it/s]

Epoch 1, Loss: 0.11081782980149989


 30%|███       | 3/10 [00:01<00:02,  2.34it/s]

Epoch 2, Loss: 0.11059385709649554


 40%|████      | 4/10 [00:01<00:02,  2.35it/s]

Epoch 3, Loss: 0.11021696628709769


 50%|█████     | 5/10 [00:02<00:02,  2.34it/s]

Epoch 4, Loss: 0.10921256164014563


 60%|██████    | 6/10 [00:02<00:01,  2.34it/s]

Epoch 5, Loss: 0.1075783321469955


 70%|███████   | 7/10 [00:02<00:01,  2.34it/s]

Epoch 6, Loss: 0.10674369440009668


 80%|████████  | 8/10 [00:03<00:00,  2.34it/s]

Epoch 7, Loss: 0.10658652688032527


 90%|█████████ | 9/10 [00:03<00:00,  2.34it/s]

Epoch 8, Loss: 0.10629170893658545


100%|██████████| 10/10 [00:04<00:00,  2.34it/s]

Epoch 9, Loss: 0.10580227908624394





After interaction 16, reward = 120.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:04,  2.21it/s]

Epoch 0, Loss: 0.11405277497879493


 20%|██        | 2/10 [00:00<00:03,  2.20it/s]

Epoch 1, Loss: 0.10915346456875136


 30%|███       | 3/10 [00:01<00:03,  2.19it/s]

Epoch 2, Loss: 0.10803748997725726


 40%|████      | 4/10 [00:01<00:02,  2.18it/s]

Epoch 3, Loss: 0.10768166899457034


 50%|█████     | 5/10 [00:02<00:02,  2.17it/s]

Epoch 4, Loss: 0.10711882420067229


 60%|██████    | 6/10 [00:02<00:01,  2.18it/s]

Epoch 5, Loss: 0.10670154194245546


 70%|███████   | 7/10 [00:03<00:01,  2.18it/s]

Epoch 6, Loss: 0.10638041108708582


 80%|████████  | 8/10 [00:03<00:00,  2.18it/s]

Epoch 7, Loss: 0.1063033713503144


 90%|█████████ | 9/10 [00:04<00:00,  2.18it/s]

Epoch 8, Loss: 0.10544459461261785


100%|██████████| 10/10 [00:04<00:00,  2.18it/s]

Epoch 9, Loss: 0.10531763100706934





After interaction 17, reward = 55.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:04,  2.05it/s]

Epoch 0, Loss: 0.11162652816915292


 20%|██        | 2/10 [00:01<00:04,  1.92it/s]

Epoch 1, Loss: 0.10764004038971768


 30%|███       | 3/10 [00:01<00:03,  1.92it/s]

Epoch 2, Loss: 0.1071058499079303


 40%|████      | 4/10 [00:02<00:03,  1.96it/s]

Epoch 3, Loss: 0.10597897066910696


 50%|█████     | 5/10 [00:02<00:02,  1.99it/s]

Epoch 4, Loss: 0.1071021876753436


 60%|██████    | 6/10 [00:03<00:01,  2.00it/s]

Epoch 5, Loss: 0.10651508837428196


 70%|███████   | 7/10 [00:03<00:01,  2.01it/s]

Epoch 6, Loss: 0.10538716438650064


 80%|████████  | 8/10 [00:04<00:00,  2.02it/s]

Epoch 7, Loss: 0.1052160409620673


 90%|█████████ | 9/10 [00:04<00:00,  2.02it/s]

Epoch 8, Loss: 0.10465910078723467


100%|██████████| 10/10 [00:04<00:00,  2.00it/s]

Epoch 9, Loss: 0.10445741576297728





After interaction 18, reward = 105.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:04,  1.90it/s]

Epoch 0, Loss: 0.11020297001243522


 20%|██        | 2/10 [00:01<00:04,  1.92it/s]

Epoch 1, Loss: 0.10747082053487114


 30%|███       | 3/10 [00:01<00:03,  1.93it/s]

Epoch 2, Loss: 0.10674014603119551


 40%|████      | 4/10 [00:02<00:03,  1.93it/s]

Epoch 3, Loss: 0.10599125767772755


 50%|█████     | 5/10 [00:02<00:02,  1.94it/s]

Epoch 4, Loss: 0.10702299642762866


 60%|██████    | 6/10 [00:03<00:02,  1.94it/s]

Epoch 5, Loss: 0.10585100361601338


 70%|███████   | 7/10 [00:03<00:01,  1.94it/s]

Epoch 6, Loss: 0.10613361601247204


 80%|████████  | 8/10 [00:04<00:01,  1.94it/s]

Epoch 7, Loss: 0.10573663616257165


 90%|█████████ | 9/10 [00:04<00:00,  1.95it/s]

Epoch 8, Loss: 0.10349041825909658


100%|██████████| 10/10 [00:05<00:00,  1.94it/s]

Epoch 9, Loss: 0.10571012280611056





After interaction 19, reward = 135.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:05,  1.79it/s]

Epoch 0, Loss: 0.11271823844106403


 20%|██        | 2/10 [00:01<00:04,  1.77it/s]

Epoch 1, Loss: 0.10784577923248284


 30%|███       | 3/10 [00:01<00:03,  1.77it/s]

Epoch 2, Loss: 0.10679820638177354


 40%|████      | 4/10 [00:02<00:03,  1.77it/s]

Epoch 3, Loss: 0.10680551961729795


 50%|█████     | 5/10 [00:02<00:02,  1.76it/s]

Epoch 4, Loss: 0.10671926697915123


 60%|██████    | 6/10 [00:03<00:02,  1.76it/s]

Epoch 5, Loss: 0.10532743660254018


 70%|███████   | 7/10 [00:04<00:01,  1.72it/s]

Epoch 6, Loss: 0.10561386662613813


 80%|████████  | 8/10 [00:04<00:01,  1.73it/s]

Epoch 7, Loss: 0.10425079229391518


 90%|█████████ | 9/10 [00:05<00:00,  1.74it/s]

Epoch 8, Loss: 0.10456700417923348


100%|██████████| 10/10 [00:05<00:00,  1.74it/s]

Epoch 9, Loss: 0.10351130389911162





After interaction 20, reward = 105.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:05,  1.69it/s]

Epoch 0, Loss: 0.10864278081005357


 20%|██        | 2/10 [00:01<00:04,  1.68it/s]

Epoch 1, Loss: 0.10580366194586062


 30%|███       | 3/10 [00:01<00:04,  1.68it/s]

Epoch 2, Loss: 0.10468352976835758


 40%|████      | 4/10 [00:02<00:03,  1.68it/s]

Epoch 3, Loss: 0.10452513555515407


 50%|█████     | 5/10 [00:02<00:02,  1.68it/s]

Epoch 4, Loss: 0.10346143193780769


 60%|██████    | 6/10 [00:03<00:02,  1.68it/s]

Epoch 5, Loss: 0.10370644841704406


 70%|███████   | 7/10 [00:04<00:01,  1.68it/s]

Epoch 6, Loss: 0.10430152622999325


 80%|████████  | 8/10 [00:04<00:01,  1.67it/s]

Epoch 7, Loss: 0.1032295474459932


 90%|█████████ | 9/10 [00:05<00:00,  1.66it/s]

Epoch 8, Loss: 0.1033684552536375


100%|██████████| 10/10 [00:05<00:00,  1.67it/s]

Epoch 9, Loss: 0.1022455197945564





After interaction 21, reward = 15.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:05,  1.61it/s]

Epoch 0, Loss: 0.1078224578657841


 20%|██        | 2/10 [00:01<00:05,  1.60it/s]

Epoch 1, Loss: 0.10442947936746308


 30%|███       | 3/10 [00:01<00:04,  1.60it/s]

Epoch 2, Loss: 0.10478713615218406


 40%|████      | 4/10 [00:02<00:03,  1.59it/s]

Epoch 3, Loss: 0.10494696409397422


 50%|█████     | 5/10 [00:03<00:03,  1.59it/s]

Epoch 4, Loss: 0.1040251920133785


 60%|██████    | 6/10 [00:03<00:02,  1.59it/s]

Epoch 5, Loss: 0.10358312831833556


 70%|███████   | 7/10 [00:04<00:01,  1.59it/s]

Epoch 6, Loss: 0.10269933484186945


 80%|████████  | 8/10 [00:05<00:01,  1.59it/s]

Epoch 7, Loss: 0.10277498740352639


 90%|█████████ | 9/10 [00:05<00:00,  1.54it/s]

Epoch 8, Loss: 0.10227263818655027


100%|██████████| 10/10 [00:06<00:00,  1.57it/s]

Epoch 9, Loss: 0.10239946517121877





After interaction 22, reward = 85.0
Training the learner
Training for 10 epochs


 10%|█         | 1/10 [00:00<00:05,  1.52it/s]

Epoch 0, Loss: 0.10643640436732842


 20%|██        | 2/10 [00:01<00:05,  1.51it/s]

Epoch 1, Loss: 0.10410253254113092


 30%|███       | 3/10 [00:01<00:04,  1.51it/s]

Epoch 2, Loss: 0.10378539958136586


 40%|████      | 4/10 [00:02<00:03,  1.50it/s]

Epoch 3, Loss: 0.10507265418778965


 50%|█████     | 5/10 [00:03<00:03,  1.51it/s]

Epoch 4, Loss: 0.10355867628893985


 60%|██████    | 6/10 [00:04<00:02,  1.47it/s]

Epoch 5, Loss: 0.1022490273345137





KeyboardInterrupt: 