In [1]:
import os.path as osp
from time import time
from datetime import datetime, timedelta
from functools import partial

import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as T
from torchvision.datasets import MNIST

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler



In [2]:
class FFNN(nn.Module):
    def __init__(self, in_shape, out_shape, p_d1=0.5, p_d2=0.4, h1=64, h2=32):
        super().__init__()
        fc1 = nn.Linear(in_shape, h1)
        a1  = nn.ReLU()
        d1  = nn.Dropout(p=p_d1)
        fc2 = nn.Linear(h1, h2)
        a2  = nn.ReLU()
        d2  = nn.Dropout(p=p_d2)
        fc3 = nn.Linear(h2, out_shape)
        
        # not applying log_softmax here, as it is applied later in 
        # the torch CCE loss
        
        self.nn = nn.Sequential(fc1, a1, d1, fc2, a2, d2, fc3)

    def forward(self, x):
        x = self.nn(x)
        return x

In [3]:
def train_mnist(config, epochs, checkpoint_dir=None, data_dir=None):
    # create model
    model = FFNN(784, 10, 
                 p_d1=config['p_d1'], 
                 p_d2=config['p_d2'], 
                 h1=config['h1'], 
                 h2=config['h2'])
    
    # load data and make a validation split
    transforms = T.Compose([T.ToTensor(), T.Normalize((0.5,),(0.5)), 
                            T.Lambda(lambda x: torch.flatten(x))])
    dataset_train = MNIST(root='/data/', transform=transforms, train=True)

    train_samples = int(len(dataset_train) * 0.8)
    train_subset, val_subset = random_split(dataset_train,
                                           [train_samples, 
                                            len(dataset_train) - train_samples])
    # create dataloaders
    train_args = {'dataset':train_subset, 
                  'batch_size':config['batch_size'], 
                  'shuffle':True, 
                  'num_workers':8, 
                  'pin_memory':True}
    dataloader_train = torch.utils.data.DataLoader(**train_args)
    val_args  = {'dataset':val_subset, 
                  'batch_size':len(val_subset), 
                  'shuffle':False, 
                  'num_workers':8}
    dataloader_val  = torch.utils.data.DataLoader(**val_args) 
    
    # choose computation host device
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)
    model.to(device)
    
    
    optimiser = torch.optim.SGD(params=model.parameters(), lr=config['lr'], momentum=0.9)
    f_loss = nn.CrossEntropyLoss()
    
    # training loop
    for n in range(epochs):
        total_loss = 0.0
        # optimisation
        model.train()
        for idx, (X, y) in enumerate(dataloader_train):
            X, y = X.to(device), y.to(device)
            optimiser.zero_grad()
            y_pred = model(X)
            loss = f_loss(y_pred, y)
            loss.backward()
            total_loss += loss.detach().cpu().item() / len(y) # normalise for batch size
            optimiser.step()
            
        # validation set metrics
        predictions, targets, val_losses = [], [], []
        model.eval()
        # we are adding the metrics tensor for each batch to a list,
        # then concatenating at the end to make one tensor with all samples
        for idx, (X, y) in enumerate(dataloader_val):
            with torch.no_grad():
                y_pred = model(X)
                predictions.append(y_pred.detach())
                targets.append(y)
                val_losses.append(f_loss(y_pred, y).cpu().item())

        predictions = torch.cat(predictions, dim=0)
        targets = torch.cat(targets, dim=0)
        predictions = torch.argmax(F.log_softmax(predictions, dim=1),dim=1)
        corrects = (predictions == targets).sum().item()
        wrongs = len(targets) - corrects
        val_accuracy = corrects / len(targets)
        val_loss = sum(val_losses) / float(len(val_losses))
        
        # save checkpoint
        with tune.checkpoint_dir(n) as checkpoint_dir:
            path = osp.join(checkpoint_dir, 'checkpoint')
            torch.save((model.state_dict(), optimiser.state_dict()), path)
            
        # report metric values back to main scheduler
        tune.report(loss=val_loss, accuracy=val_accuracy)
        
def test_accuracy(model, device='cpu'):
    transforms = T.Compose([T.ToTensor(), T.Normalize((0.5,),(0.5)), 
                            T.Lambda(lambda x: torch.flatten(x))])
    dataset_test  = MNIST(root='/data/', transform=transforms, train=False)
    test_args  = {'dataset':dataset_test, 
                  'batch_size':len(dataset_test), 
                  'shuffle':False, 
                  'num_workers':8}
    dataloader_test  = torch.utils.data.DataLoader(**test_args) 
    
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for (X, y) in dataloader_test:
            y_pred = model(X)
            predictions.append(y_pred.detach())
            targets.append(y)

        predictions = torch.cat(predictions, dim=0)
        targets = torch.cat(targets, dim=0)
        predictions = torch.argmax(F.log_softmax(predictions, dim=1),dim=1)
        corrects = (predictions == targets).sum().item()
        wrongs = len(targets) - corrects
        test_accuracy = corrects / len(targets)
        
    return  test_accuracy

In [4]:
max_epochs = 20
num_samples = 10

config = {'lr':tune.loguniform(1e-4, 1e-1), 
          'batch_size':tune.choice([32, 64, 256, 512]), 
          'p_d1':tune.uniform(0.1,0.9), 
          'p_d2':tune.uniform(0.1,0.9), 
          'h1':tune.choice([32, 64, 256, 512, 1024]), 
          'h2':tune.choice([32, 64, 256, 512, 1024])}


scheduler = ASHAScheduler(metric='loss', 
                        mode='min',
                        max_t=20, 
                        grace_period=2, 
                        reduction_factor=2)

reporter  = CLIReporter(metric_columns=['loss', 'accuracy', 'training_iteration'])

resources = {'cpu':2} 
if torch.cuda.is_available():
    resources['gpu'] = 0.5
    
result = tune.run(partial(train_mnist, epochs=max_epochs),
                  resources_per_trial=resources, 
                  config=config, 
                  num_samples=num_samples, 
                  scheduler=scheduler, 
                  progress_reporter=reporter)

best_trial = result.get_best_trial('loss', 'min', 'last')

2021-07-07 10:15:09,476	INFO services.py:1272 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2021-07-07 10:15:11,307	INFO registry.py:64 -- Detected unknown callable for trainable. Converting to class.
2021-07-07 10:15:11,601	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


== Status ==
Memory usage on this node: 1.9/7.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects
Result logdir: /home/chris/ray_results/DEFAULT_2021-07-07_10-15-11
Number of trials: 10/10 (10 PENDING)
+---------------------+----------+-------+--------------+------+------+-------------+----------+----------+
| Trial name          | status   | loc   |   batch_size |   h1 |   h2 |          lr |     p_d1 |     p_d2 |
|---------------------+----------+-------+--------------+------+------+-------------+----------+----------|
| DEFAULT_d4cef_00000 | PENDING  |       |           32 | 1024 |   32 | 0.00397303  | 0.281003 | 0.220526 |
| DEFAULT_d4cef_00001 | PENDING  |       |           32 |  512 |   64 | 0.0199141   | 0.704943 | 0.588271 |
| DEFAULT_d4cef_00002 | PENDING  |       |           64 |  256 | 1024 | 0.000969792 | 0.847073 | 0.5156

[2m[36m(pid=144956)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=144958)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=144959)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=144960)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


== Status ==
Memory usage on this node: 2.8/7.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None
Resources requested: 8.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects
Result logdir: /home/chris/ray_results/DEFAULT_2021-07-07_10-15-11
Number of trials: 10/10 (6 PENDING, 4 RUNNING)
+---------------------+----------+-------+--------------+------+------+-------------+----------+----------+
| Trial name          | status   | loc   |   batch_size |   h1 |   h2 |          lr |     p_d1 |     p_d2 |
|---------------------+----------+-------+--------------+------+------+-------------+----------+----------|
| DEFAULT_d4cef_00000 | RUNNING  |       |           32 | 1024 |   32 | 0.00397303  | 0.281003 | 0.220526 |
| DEFAULT_d4cef_00001 | RUNNING  |       |           32 |  512 |   64 | 0.0199141   | 0.704943 | 0.588271 |
| DEFAULT_d4cef_00002 | RUNNING  |       |           64 |  256 | 1024 | 0.000969792 | 0.847

Result for DEFAULT_d4cef_00003:
  accuracy: 0.7668333333333334
  date: 2021-07-07_10-15-46
  done: false
  experiment_id: e94b76da220347519fa20507605b080b
  hostname: chris-server
  iterations_since_restore: 1
  loss: 0.7905436158180237
  node_ip: 192.168.1.58
  pid: 144960
  should_checkpoint: true
  time_since_restore: 33.733715772628784
  time_this_iter_s: 33.733715772628784
  time_total_s: 33.733715772628784
  timestamp: 1625649346
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: d4cef_00003
  
Result for DEFAULT_d4cef_00000:
  accuracy: 0.93275
  date: 2021-07-07_10-15-51
  done: false
  experiment_id: afce06fdab70461f9fc10348aa37716a
  hostname: chris-server
  iterations_since_restore: 1
  loss: 0.2378310114145279
  node_ip: 192.168.1.58
  pid: 144958
  should_checkpoint: true
  time_since_restore: 39.328484535217285
  time_this_iter_s: 39.328484535217285
  time_total_s: 39.328484535217285
  timestamp: 1625649351
  timesteps_since_restore: 0
  training_iteration: 

Result for DEFAULT_d4cef_00002:
  accuracy: 0.8845
  date: 2021-07-07_10-16-16
  done: false
  experiment_id: 65deb7a803c24555933ba9bec49f21e6
  hostname: chris-server
  iterations_since_restore: 3
  loss: 0.40108102560043335
  node_ip: 192.168.1.58
  pid: 144959
  should_checkpoint: true
  time_since_restore: 64.09695959091187
  time_this_iter_s: 21.3787624835968
  time_total_s: 64.09695959091187
  timestamp: 1625649376
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: d4cef_00002
  


[2m[36m(pid=144957)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


Result for DEFAULT_d4cef_00003:
  accuracy: 0.7025
  date: 2021-07-07_10-16-23
  done: true
  experiment_id: e94b76da220347519fa20507605b080b
  hostname: chris-server
  iterations_since_restore: 2
  loss: 0.9275482296943665
  node_ip: 192.168.1.58
  pid: 144960
  should_checkpoint: true
  time_since_restore: 71.40982007980347
  time_this_iter_s: 37.67610430717468
  time_total_s: 71.40982007980347
  timestamp: 1625649383
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: d4cef_00003
  
== Status ==
Memory usage on this node: 2.8/7.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: -0.9275482296943665
Resources requested: 8.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_0_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_0_9dfef44093f07a6ba130c6d757ccae3f, 0.0/2.0 CPU_group_1666f0e45b667b54936835566296a213, 0.0/2.0 CPU_group_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_g

[2m[36m(pid=144955)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


Result for DEFAULT_d4cef_00000:
  accuracy: 0.9485
  date: 2021-07-07_10-16-28
  done: false
  experiment_id: afce06fdab70461f9fc10348aa37716a
  hostname: chris-server
  iterations_since_restore: 2
  loss: 0.1698533594608307
  node_ip: 192.168.1.58
  pid: 144958
  should_checkpoint: true
  time_since_restore: 75.73120856285095
  time_this_iter_s: 36.40272402763367
  time_total_s: 75.73120856285095
  timestamp: 1625649388
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: d4cef_00000
  
Result for DEFAULT_d4cef_00002:
  accuracy: 0.8989166666666667
  date: 2021-07-07_10-16-37
  done: false
  experiment_id: 65deb7a803c24555933ba9bec49f21e6
  hostname: chris-server
  iterations_since_restore: 4
  loss: 0.35762104392051697
  node_ip: 192.168.1.58
  pid: 144959
  should_checkpoint: true
  time_since_restore: 84.86661744117737
  time_this_iter_s: 20.769657850265503
  time_total_s: 84.86661744117737
  timestamp: 1625649397
  timesteps_since_restore: 0
  training_iteration: 4
  t

Result for DEFAULT_d4cef_00005:
  accuracy: 0.9171666666666667
  date: 2021-07-07_10-16-52
  done: false
  experiment_id: 9ab16d3f32b94ad3bcc43f362c08b479
  hostname: chris-server
  iterations_since_restore: 2
  loss: 0.2814309895038605
  node_ip: 192.168.1.58
  pid: 144955
  should_checkpoint: true
  time_since_restore: 26.896984100341797
  time_this_iter_s: 13.595092535018921
  time_total_s: 26.896984100341797
  timestamp: 1625649412
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: d4cef_00005
  
Result for DEFAULT_d4cef_00002:
  accuracy: 0.9054166666666666
  date: 2021-07-07_10-16-58
  done: false
  experiment_id: 65deb7a803c24555933ba9bec49f21e6
  hostname: chris-server
  iterations_since_restore: 5
  loss: 0.3283148407936096
  node_ip: 192.168.1.58
  pid: 144959
  should_checkpoint: true
  time_since_restore: 106.44252681732178
  time_this_iter_s: 21.57590937614441
  time_total_s: 106.44252681732178
  timestamp: 1625649418
  timesteps_since_restore: 0
  training_i

Result for DEFAULT_d4cef_00002:
  accuracy: 0.911
  date: 2021-07-07_10-17-19
  done: false
  experiment_id: 65deb7a803c24555933ba9bec49f21e6
  hostname: chris-server
  iterations_since_restore: 6
  loss: 0.30626189708709717
  node_ip: 192.168.1.58
  pid: 144959
  should_checkpoint: true
  time_since_restore: 126.94026041030884
  time_this_iter_s: 20.49773359298706
  time_total_s: 126.94026041030884
  timestamp: 1625649439
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: d4cef_00002
  
== Status ==
Memory usage on this node: 2.9/7.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: -0.35762104392051697 | Iter 2.000: -0.4982490539550781
Resources requested: 8.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_0_1666f0e45b667b54936835566296a213, 0.0/2.0 CPU_group_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_9dfef44093f07a6ba130c6d757ccae3f, 0.0/2.0 CPU_group_1666f0e45b667b54936835566296a2

Result for DEFAULT_d4cef_00002:
  accuracy: 0.911
  date: 2021-07-07_10-17-40
  done: false
  experiment_id: 65deb7a803c24555933ba9bec49f21e6
  hostname: chris-server
  iterations_since_restore: 7
  loss: 0.2991997003555298
  node_ip: 192.168.1.58
  pid: 144959
  should_checkpoint: true
  time_since_restore: 148.37695002555847
  time_this_iter_s: 21.436689615249634
  time_total_s: 148.37695002555847
  timestamp: 1625649460
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: d4cef_00002
  
== Status ==
Memory usage on this node: 3.1/7.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: -0.2807219624519348 | Iter 2.000: -0.3898400217294693
Resources requested: 8.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_072cf17df9fdf3ff5298636eeb895304, 0.0/2.0 CPU_group_0_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_1666f0e45b667b54936835566296a213, 0.0/2.0 CPU_group_9dfef44093f07a6ba130c6d757ccae3

Result for DEFAULT_d4cef_00002:
  accuracy: 0.918
  date: 2021-07-07_10-18-02
  done: false
  experiment_id: 65deb7a803c24555933ba9bec49f21e6
  hostname: chris-server
  iterations_since_restore: 8
  loss: 0.2767053246498108
  node_ip: 192.168.1.58
  pid: 144959
  should_checkpoint: true
  time_since_restore: 170.01902556419373
  time_this_iter_s: 21.642075538635254
  time_total_s: 170.01902556419373
  timestamp: 1625649482
  timesteps_since_restore: 0
  training_iteration: 8
  trial_id: d4cef_00002
  
Result for DEFAULT_d4cef_00004:
  accuracy: 0.9533333333333334
  date: 2021-07-07_10-18-06
  done: false
  experiment_id: 6f977986f456449abe7fd13205cd4e50
  hostname: chris-server
  iterations_since_restore: 3
  loss: 0.1521303653717041
  node_ip: 192.168.1.58
  pid: 144957
  should_checkpoint: true
  time_since_restore: 110.07843351364136
  time_this_iter_s: 37.0233428478241
  time_total_s: 110.07843351364136
  timestamp: 1625649486
  timesteps_since_restore: 0
  training_iteration: 3
  

Result for DEFAULT_d4cef_00002:
  accuracy: 0.91975
  date: 2021-07-07_10-18-22
  done: false
  experiment_id: 65deb7a803c24555933ba9bec49f21e6
  hostname: chris-server
  iterations_since_restore: 9
  loss: 0.2679392099380493
  node_ip: 192.168.1.58
  pid: 144959
  should_checkpoint: true
  time_since_restore: 190.28488945960999
  time_this_iter_s: 20.26586389541626
  time_total_s: 190.28488945960999
  timestamp: 1625649502
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: d4cef_00002
  
== Status ==
Memory usage on this node: 2.9/7.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 16.000: None | Iter 8.000: -0.20995624363422394 | Iter 4.000: -0.20382288098335266 | Iter 2.000: -0.3898400217294693
Resources requested: 8.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_0_9dfef44093f07a6ba130c6d757ccae3f, 0.0/2.0 CPU_group_0_072cf17df9fdf3ff5298636eeb895304, 0.0/2.0 CPU_group_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_0_1666f0e45

Result for DEFAULT_d4cef_00004:
  accuracy: 0.9583333333333334
  date: 2021-07-07_10-18-44
  done: false
  experiment_id: 6f977986f456449abe7fd13205cd4e50
  hostname: chris-server
  iterations_since_restore: 4
  loss: 0.13659362494945526
  node_ip: 192.168.1.58
  pid: 144957
  should_checkpoint: true
  time_since_restore: 148.6800594329834
  time_this_iter_s: 38.60162591934204
  time_total_s: 148.6800594329834
  timestamp: 1625649524
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: d4cef_00004
  
Result for DEFAULT_d4cef_00005:
  accuracy: 0.96525
  date: 2021-07-07_10-18-52
  done: false
  experiment_id: 9ab16d3f32b94ad3bcc43f362c08b479
  hostname: chris-server
  iterations_since_restore: 11
  loss: 0.11868540197610855
  node_ip: 192.168.1.58
  pid: 144955
  should_checkpoint: true
  time_since_restore: 147.2964735031128
  time_this_iter_s: 12.21390414237976
  time_total_s: 147.2964735031128
  timestamp: 1625649532
  timesteps_since_restore: 0
  training_iteration: 11


Result for DEFAULT_d4cef_00005:
  accuracy: 0.966
  date: 2021-07-07_10-19-06
  done: false
  experiment_id: 9ab16d3f32b94ad3bcc43f362c08b479
  hostname: chris-server
  iterations_since_restore: 12
  loss: 0.11576300114393234
  node_ip: 192.168.1.58
  pid: 144955
  should_checkpoint: true
  time_since_restore: 160.86883926391602
  time_this_iter_s: 13.572365760803223
  time_total_s: 160.86883926391602
  timestamp: 1625649546
  timesteps_since_restore: 0
  training_iteration: 12
  trial_id: d4cef_00005
  
Result for DEFAULT_d4cef_00005:
  accuracy: 0.9681666666666666
  date: 2021-07-07_10-19-20
  done: false
  experiment_id: 9ab16d3f32b94ad3bcc43f362c08b479
  hostname: chris-server
  iterations_since_restore: 13
  loss: 0.10839436203241348
  node_ip: 192.168.1.58
  pid: 144955
  should_checkpoint: true
  time_since_restore: 175.05366230010986
  time_this_iter_s: 14.184823036193848
  time_total_s: 175.05366230010986
  timestamp: 1625649560
  timesteps_since_restore: 0
  training_iteratio

Result for DEFAULT_d4cef_00000:
  accuracy: 0.9686666666666667
  date: 2021-07-07_10-19-30
  done: false
  experiment_id: afce06fdab70461f9fc10348aa37716a
  hostname: chris-server
  iterations_since_restore: 7
  loss: 0.10417016595602036
  node_ip: 192.168.1.58
  pid: 144958
  should_checkpoint: true
  time_since_restore: 258.27175211906433
  time_this_iter_s: 36.527605056762695
  time_total_s: 258.27175211906433
  timestamp: 1625649570
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: d4cef_00000
  
Result for DEFAULT_d4cef_00005:
  accuracy: 0.9679166666666666
  date: 2021-07-07_10-19-32
  done: false
  experiment_id: 9ab16d3f32b94ad3bcc43f362c08b479
  hostname: chris-server
  iterations_since_restore: 14
  loss: 0.10760781168937683
  node_ip: 192.168.1.58
  pid: 144955
  should_checkpoint: true
  time_since_restore: 187.3245825767517
  time_this_iter_s: 12.270920276641846
  time_total_s: 187.3245825767517
  timestamp: 1625649572
  timesteps_since_restore: 0
  training

Result for DEFAULT_d4cef_00005:
  accuracy: 0.9701666666666666
  date: 2021-07-07_10-20-00
  done: false
  experiment_id: 9ab16d3f32b94ad3bcc43f362c08b479
  hostname: chris-server
  iterations_since_restore: 16
  loss: 0.10233167558908463
  node_ip: 192.168.1.58
  pid: 144955
  should_checkpoint: true
  time_since_restore: 214.88869452476501
  time_this_iter_s: 13.832619190216064
  time_total_s: 214.88869452476501
  timestamp: 1625649600
  timesteps_since_restore: 0
  training_iteration: 16
  trial_id: d4cef_00005
  
== Status ==
Memory usage on this node: 3.0/7.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 16.000: -0.10233167558908463 | Iter 8.000: -0.20995624363422394 | Iter 4.000: -0.17020825296640396 | Iter 2.000: -0.3898400217294693
Resources requested: 8.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_1666f0e45b667b54936835566296a213, 0.0/2.0 CPU_group_072cf17df9fdf3ff5298636eeb895304, 0.0/2.0 CPU_group_9dfef44093f07a6ba130c6d757ccae3f, 0.

Result for DEFAULT_d4cef_00005:
  accuracy: 0.9718333333333333
  date: 2021-07-07_10-20-12
  done: false
  experiment_id: 9ab16d3f32b94ad3bcc43f362c08b479
  hostname: chris-server
  iterations_since_restore: 17
  loss: 0.09679713100194931
  node_ip: 192.168.1.58
  pid: 144955
  should_checkpoint: true
  time_since_restore: 227.21265363693237
  time_this_iter_s: 12.323959112167358
  time_total_s: 227.21265363693237
  timestamp: 1625649612
  timesteps_since_restore: 0
  training_iteration: 17
  trial_id: d4cef_00005
  
== Status ==
Memory usage on this node: 3.0/7.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 16.000: -0.10233167558908463 | Iter 8.000: -0.14320716261863708 | Iter 4.000: -0.17020825296640396 | Iter 2.000: -0.3898400217294693
Resources requested: 8.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_9dfef44093f07a6ba130c6d757ccae3f, 0.0/2.0 CPU_group_0_9dfef44093f07a6ba130c6d757ccae3f, 

Result for DEFAULT_d4cef_00004:
  accuracy: 0.9629166666666666
  date: 2021-07-07_10-20-40
  done: false
  experiment_id: 6f977986f456449abe7fd13205cd4e50
  hostname: chris-server
  iterations_since_restore: 7
  loss: 0.11811651289463043
  node_ip: 192.168.1.58
  pid: 144957
  should_checkpoint: true
  time_since_restore: 264.2965919971466
  time_this_iter_s: 38.852609157562256
  time_total_s: 264.2965919971466
  timestamp: 1625649640
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: d4cef_00004
  
Result for DEFAULT_d4cef_00000:
  accuracy: 0.9715
  date: 2021-07-07_10-20-44
  done: false
  experiment_id: afce06fdab70461f9fc10348aa37716a
  hostname: chris-server
  iterations_since_restore: 9
  loss: 0.09595455229282379
  node_ip: 192.168.1.58
  pid: 144958
  should_checkpoint: true
  time_since_restore: 331.7384102344513
  time_this_iter_s: 36.50395965576172
  time_total_s: 331.7384102344513
  timestamp: 1625649644
  timesteps_since_restore: 0
  training_iteration: 9
  

[2m[36m(pid=144953)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


Result for DEFAULT_d4cef_00005:
  accuracy: 0.9740833333333333
  date: 2021-07-07_10-20-52
  done: true
  experiment_id: 9ab16d3f32b94ad3bcc43f362c08b479
  hostname: chris-server
  iterations_since_restore: 20
  loss: 0.08966246247291565
  node_ip: 192.168.1.58
  pid: 144955
  should_checkpoint: true
  time_since_restore: 266.93277764320374
  time_this_iter_s: 12.453980684280396
  time_total_s: 266.93277764320374
  timestamp: 1625649652
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: d4cef_00005
  


[2m[36m(pid=144954)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


Result for DEFAULT_d4cef_00007:
  accuracy: 0.4850833333333333
  date: 2021-07-07_10-21-04
  done: false
  experiment_id: 2e4e21eed21044eaa44b0f9c65f6d805
  hostname: chris-server
  iterations_since_restore: 1
  loss: 2.087460994720459
  node_ip: 192.168.1.58
  pid: 144954
  should_checkpoint: true
  time_since_restore: 10.539774894714355
  time_this_iter_s: 10.539774894714355
  time_total_s: 10.539774894714355
  timestamp: 1625649664
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: d4cef_00007
  
== Status ==
Memory usage on this node: 2.8/7.7 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 16.000: -0.16276514902710915 | Iter 8.000: -0.14320716261863708 | Iter 4.000: -0.17020825296640396 | Iter 2.000: -0.3898400217294693
Resources requested: 8.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_072cf17df9fdf3ff5298636eeb895304, 0.0/2.0 CPU_group_0_9dfef44093f07a6ba130c6d757ccae3f, 0.0/2.0 CPU_group_1666f0e45b667b54936835566296a213, 0.0/



Result for DEFAULT_d4cef_00000:
  accuracy: 0.9710833333333333
  date: 2021-07-07_10-21-15
  done: false
  experiment_id: afce06fdab70461f9fc10348aa37716a
  hostname: chris-server
  iterations_since_restore: 10
  loss: 0.0982174277305603
  node_ip: 192.168.1.58
  pid: 144958
  should_checkpoint: true
  time_since_restore: 362.96229004859924
  time_this_iter_s: 31.22387981414795
  time_total_s: 362.96229004859924
  timestamp: 1625649675
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: d4cef_00000
  
Result for DEFAULT_d4cef_00004:
  accuracy: 0.969
  date: 2021-07-07_10-21-15
  done: false
  experiment_id: 6f977986f456449abe7fd13205cd4e50
  hostname: chris-server
  iterations_since_restore: 8
  loss: 0.10603876411914825
  node_ip: 192.168.1.58
  pid: 144957
  should_checkpoint: true
  time_since_restore: 299.42242908477783
  time_this_iter_s: 35.125837087631226
  time_total_s: 299.42242908477783
  timestamp: 1625649675
  timesteps_since_restore: 0
  training_iteration: 

[2m[36m(pid=155149)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


Result for DEFAULT_d4cef_00006:
  accuracy: 0.8793333333333333
  date: 2021-07-07_10-21-19
  done: true
  experiment_id: 0de5651096f646a58375a0845aa1041c
  hostname: chris-server
  iterations_since_restore: 2
  loss: 0.6360002160072327
  node_ip: 192.168.1.58
  pid: 144953
  should_checkpoint: true
  time_since_restore: 28.261991262435913
  time_this_iter_s: 11.967723608016968
  time_total_s: 28.261991262435913
  timestamp: 1625649679
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: d4cef_00006
  
== Status ==
Memory usage on this node: 2.8/7.7 GiB
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 16.000: -0.16276514902710915 | Iter 8.000: -0.12462296336889267 | Iter 4.000: -0.17020825296640396 | Iter 2.000: -0.5671246349811554
Resources requested: 8.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_0_1666f0e45b667b54936835566296a213, 0.0/2.0 CPU_group_9dfef44093f07a6ba130c6d757ccae3f, 0.0/2.0 CPU_group_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/

[2m[36m(pid=155580)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


Result for DEFAULT_d4cef_00008:
  accuracy: 0.5494166666666667
  date: 2021-07-07_10-21-22
  done: false
  experiment_id: dd329a9526764bcf9caa6dca5245ae49
  hostname: chris-server
  iterations_since_restore: 1
  loss: 1.912758469581604
  node_ip: 192.168.1.58
  pid: 155149
  should_checkpoint: true
  time_since_restore: 6.028026580810547
  time_this_iter_s: 6.028026580810547
  time_total_s: 6.028026580810547
  timestamp: 1625649682
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: d4cef_00008
  
Result for DEFAULT_d4cef_00008:
  accuracy: 0.757
  date: 2021-07-07_10-21-29
  done: true
  experiment_id: dd329a9526764bcf9caa6dca5245ae49
  hostname: chris-server
  iterations_since_restore: 2
  loss: 1.3960334062576294
  node_ip: 192.168.1.58
  pid: 155149
  should_checkpoint: true
  time_since_restore: 12.864919185638428
  time_this_iter_s: 6.836892604827881
  time_total_s: 12.864919185638428
  timestamp: 1625649689
  timesteps_since_restore: 0
  training_iteration: 2
  tria

Result for DEFAULT_d4cef_00004:
  accuracy: 0.9690833333333333
  date: 2021-07-07_10-21-48
  done: false
  experiment_id: 6f977986f456449abe7fd13205cd4e50
  hostname: chris-server
  iterations_since_restore: 9
  loss: 0.10347989201545715
  node_ip: 192.168.1.58
  pid: 144957
  should_checkpoint: true
  time_since_restore: 332.50760316848755
  time_this_iter_s: 33.08517408370972
  time_total_s: 332.50760316848755
  timestamp: 1625649708
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: d4cef_00004
  
Result for DEFAULT_d4cef_00009:
  accuracy: 0.9381666666666667
  date: 2021-07-07_10-22-01
  done: false
  experiment_id: e361f3100f2b46448367d5bbf6fe0fd4
  hostname: chris-server
  iterations_since_restore: 2
  loss: 0.20191600918769836
  node_ip: 192.168.1.58
  pid: 155580
  should_checkpoint: true
  time_since_restore: 39.60135054588318
  time_this_iter_s: 17.752320051193237
  time_total_s: 39.60135054588318
  timestamp: 1625649721
  timesteps_since_restore: 0
  training_i

Result for DEFAULT_d4cef_00009:
  accuracy: 0.9571666666666667
  date: 2021-07-07_10-22-39
  done: false
  experiment_id: e361f3100f2b46448367d5bbf6fe0fd4
  hostname: chris-server
  iterations_since_restore: 4
  loss: 0.14761251211166382
  node_ip: 192.168.1.58
  pid: 155580
  should_checkpoint: true
  time_since_restore: 77.9219925403595
  time_this_iter_s: 20.137728214263916
  time_total_s: 77.9219925403595
  timestamp: 1625649759
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: d4cef_00009
  
== Status ==
Memory usage on this node: 2.6/7.7 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 16.000: -0.16276514902710915 | Iter 8.000: -0.12462296336889267 | Iter 4.000: -0.14761251211166382 | Iter 2.000: -0.5671246349811554
Resources requested: 6.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_072cf17df9fdf3ff5298636eeb895304, 0.0/2.0 CPU_group_9dfef44093f07a6ba130c6d757ccae3f, 0.0/2.0 CPU_group_0_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.

Result for DEFAULT_d4cef_00009:
  accuracy: 0.959
  date: 2021-07-07_10-22-58
  done: false
  experiment_id: e361f3100f2b46448367d5bbf6fe0fd4
  hostname: chris-server
  iterations_since_restore: 5
  loss: 0.1388627141714096
  node_ip: 192.168.1.58
  pid: 155580
  should_checkpoint: true
  time_since_restore: 96.18566560745239
  time_this_iter_s: 18.263673067092896
  time_total_s: 96.18566560745239
  timestamp: 1625649778
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: d4cef_00009
  
== Status ==
Memory usage on this node: 2.6/7.7 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 16.000: -0.16276514902710915 | Iter 8.000: -0.12462296336889267 | Iter 4.000: -0.14761251211166382 | Iter 2.000: -0.5671246349811554
Resources requested: 6.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_0_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_072cf17df9fdf3ff5298636eeb895304, 0.0/2.0 CPU_group_0_9dfef44093f07a6ba130c6d757ccae3f, 0.0/2.0 CPU_grou

Result for DEFAULT_d4cef_00009:
  accuracy: 0.9638333333333333
  date: 2021-07-07_10-23-36
  done: false
  experiment_id: e361f3100f2b46448367d5bbf6fe0fd4
  hostname: chris-server
  iterations_since_restore: 7
  loss: 0.12496575713157654
  node_ip: 192.168.1.58
  pid: 155580
  should_checkpoint: true
  time_since_restore: 134.47105240821838
  time_this_iter_s: 19.497008085250854
  time_total_s: 134.47105240821838
  timestamp: 1625649816
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: d4cef_00009
  
== Status ==
Memory usage on this node: 2.6/7.7 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 16.000: -0.16276514902710915 | Iter 8.000: -0.12462296336889267 | Iter 4.000: -0.14761251211166382 | Iter 2.000: -0.5671246349811554
Resources requested: 6.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_0_072cf17df9fdf3ff5298636eeb895304, 0.0/2.0 CPU_group_072cf17df9fdf3ff5298636eeb895304, 0.0/2.0 CPU_group_095b3f18f9ccb0d439ba4db6cd0d258e, 0.

Result for DEFAULT_d4cef_00009:
  accuracy: 0.9655833333333333
  date: 2021-07-07_10-23-54
  done: false
  experiment_id: e361f3100f2b46448367d5bbf6fe0fd4
  hostname: chris-server
  iterations_since_restore: 8
  loss: 0.11808866262435913
  node_ip: 192.168.1.58
  pid: 155580
  should_checkpoint: true
  time_since_restore: 153.0720579624176
  time_this_iter_s: 18.60100555419922
  time_total_s: 153.0720579624176
  timestamp: 1625649834
  timesteps_since_restore: 0
  training_iteration: 8
  trial_id: d4cef_00009
  
Result for DEFAULT_d4cef_00000:
  accuracy: 0.9746666666666667
  date: 2021-07-07_10-24-11
  done: false
  experiment_id: afce06fdab70461f9fc10348aa37716a
  hostname: chris-server
  iterations_since_restore: 16
  loss: 0.08645176887512207
  node_ip: 192.168.1.58
  pid: 144958
  should_checkpoint: true
  time_since_restore: 539.5195891857147
  time_this_iter_s: 28.85742974281311
  time_total_s: 539.5195891857147
  timestamp: 1625649851
  timesteps_since_restore: 0
  training_ite

Result for DEFAULT_d4cef_00009:
  accuracy: 0.9638333333333333
  date: 2021-07-07_10-24-33
  done: false
  experiment_id: e361f3100f2b46448367d5bbf6fe0fd4
  hostname: chris-server
  iterations_since_restore: 10
  loss: 0.11789297312498093
  node_ip: 192.168.1.58
  pid: 155580
  should_checkpoint: true
  time_since_restore: 191.9699673652649
  time_this_iter_s: 19.88722825050354
  time_total_s: 191.9699673652649
  timestamp: 1625649873
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: d4cef_00009
  
== Status ==
Memory usage on this node: 2.6/7.7 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 16.000: -0.10233167558908463 | Iter 8.000: -0.11808866262435913 | Iter 4.000: -0.14761251211166382 | Iter 2.000: -0.5671246349811554
Resources requested: 6.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_9dfef44093f07a6ba130c6d757ccae3f, 0.0/2.0 CPU_group_072cf17df9fdf3ff5298636eeb895304, 0.0/2

Result for DEFAULT_d4cef_00004:
  accuracy: 0.9736666666666667
  date: 2021-07-07_10-24-53
  done: false
  experiment_id: 6f977986f456449abe7fd13205cd4e50
  hostname: chris-server
  iterations_since_restore: 15
  loss: 0.08819311857223511
  node_ip: 192.168.1.58
  pid: 144957
  should_checkpoint: true
  time_since_restore: 517.6485028266907
  time_this_iter_s: 30.62800121307373
  time_total_s: 517.6485028266907
  timestamp: 1625649893
  timesteps_since_restore: 0
  training_iteration: 15
  trial_id: d4cef_00004
  
Result for DEFAULT_d4cef_00000:
  accuracy: 0.9784166666666667
  date: 2021-07-07_10-25-09
  done: false
  experiment_id: afce06fdab70461f9fc10348aa37716a
  hostname: chris-server
  iterations_since_restore: 18
  loss: 0.08167976140975952
  node_ip: 192.168.1.58
  pid: 144958
  should_checkpoint: true
  time_since_restore: 596.7229130268097
  time_this_iter_s: 28.599908590316772
  time_total_s: 596.7229130268097
  timestamp: 1625649909
  timesteps_since_restore: 0
  training_

Result for DEFAULT_d4cef_00009:
  accuracy: 0.9724166666666667
  date: 2021-07-07_10-25-30
  done: false
  experiment_id: e361f3100f2b46448367d5bbf6fe0fd4
  hostname: chris-server
  iterations_since_restore: 13
  loss: 0.09370264410972595
  node_ip: 192.168.1.58
  pid: 155580
  should_checkpoint: true
  time_since_restore: 249.10791873931885
  time_this_iter_s: 19.767194747924805
  time_total_s: 249.10791873931885
  timestamp: 1625649930
  timesteps_since_restore: 0
  training_iteration: 13
  trial_id: d4cef_00009
  
== Status ==
Memory usage on this node: 2.6/7.7 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 16.000: -0.09563710168004036 | Iter 8.000: -0.11808866262435913 | Iter 4.000: -0.14761251211166382 | Iter 2.000: -0.5671246349811554
Resources requested: 6.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_0_9dfef44093f07a6ba130c6d757ccae3f, 0.0/2.0 CPU_group_0_072cf17df9fdf3ff5298636eeb895304, 0.0/2.0 CPU_group_0_095b3f18f9ccb0d439ba4db6cd0d25

Result for DEFAULT_d4cef_00004:
  accuracy: 0.97625
  date: 2021-07-07_10-25-55
  done: false
  experiment_id: 6f977986f456449abe7fd13205cd4e50
  hostname: chris-server
  iterations_since_restore: 17
  loss: 0.08197546750307083
  node_ip: 192.168.1.58
  pid: 144957
  should_checkpoint: true
  time_since_restore: 579.5874133110046
  time_this_iter_s: 30.230751991271973
  time_total_s: 579.5874133110046
  timestamp: 1625649955
  timesteps_since_restore: 0
  training_iteration: 17
  trial_id: d4cef_00004
  
== Status ==
Memory usage on this node: 2.6/7.7 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 16.000: -0.09563710168004036 | Iter 8.000: -0.11808866262435913 | Iter 4.000: -0.14761251211166382 | Iter 2.000: -0.5671246349811554
Resources requested: 6.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_0_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_0_9dfef44093f07a6ba130c6d757ccae3f, 0.0/2.0 CPU_group_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU

Result for DEFAULT_d4cef_00009:
  accuracy: 0.9730833333333333
  date: 2021-07-07_10-26-20
  done: false
  experiment_id: e361f3100f2b46448367d5bbf6fe0fd4
  hostname: chris-server
  iterations_since_restore: 16
  loss: 0.0907982885837555
  node_ip: 192.168.1.58
  pid: 155580
  should_checkpoint: true
  time_since_restore: 298.87469720840454
  time_this_iter_s: 12.836217403411865
  time_total_s: 298.87469720840454
  timestamp: 1625649980
  timesteps_since_restore: 0
  training_iteration: 16
  trial_id: d4cef_00009
  
Result for DEFAULT_d4cef_00009:
  accuracy: 0.97275
  date: 2021-07-07_10-26-33
  done: false
  experiment_id: e361f3100f2b46448367d5bbf6fe0fd4
  hostname: chris-server
  iterations_since_restore: 17
  loss: 0.0971529632806778
  node_ip: 192.168.1.58
  pid: 155580
  should_checkpoint: true
  time_since_restore: 312.1344618797302
  time_this_iter_s: 13.259764671325684
  time_total_s: 312.1344618797302
  timestamp: 1625649993
  timesteps_since_restore: 0
  training_iteration:

Result for DEFAULT_d4cef_00009:
  accuracy: 0.9740833333333333
  date: 2021-07-07_10-26-59
  done: false
  experiment_id: e361f3100f2b46448367d5bbf6fe0fd4
  hostname: chris-server
  iterations_since_restore: 19
  loss: 0.09059323370456696
  node_ip: 192.168.1.58
  pid: 155580
  should_checkpoint: true
  time_since_restore: 337.9436159133911
  time_this_iter_s: 13.298288106918335
  time_total_s: 337.9436159133911
  timestamp: 1625650019
  timesteps_since_restore: 0
  training_iteration: 19
  trial_id: d4cef_00009
  
== Status ==
Memory usage on this node: 2.3/7.7 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 16.000: -0.0907982885837555 | Iter 8.000: -0.11808866262435913 | Iter 4.000: -0.14761251211166382 | Iter 2.000: -0.5671246349811554
Resources requested: 4.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_0_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_072cf17df9fdf3ff5298636eeb895304, 0.0

2021-07-07 10:27:10,265	INFO tune.py:549 -- Total run time: 718.96 seconds (718.80 seconds for the tuning loop).


Result for DEFAULT_d4cef_00009:
  accuracy: 0.9736666666666667
  date: 2021-07-07_10-27-10
  done: true
  experiment_id: e361f3100f2b46448367d5bbf6fe0fd4
  hostname: chris-server
  iterations_since_restore: 20
  loss: 0.08877608180046082
  node_ip: 192.168.1.58
  pid: 155580
  should_checkpoint: true
  time_since_restore: 348.2736442089081
  time_this_iter_s: 10.330028295516968
  time_total_s: 348.2736442089081
  timestamp: 1625650030
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: d4cef_00009
  
== Status ==
Memory usage on this node: 1.9/7.7 GiB
Using AsyncHyperBand: num_stopped=10
Bracket: Iter 16.000: -0.0907982885837555 | Iter 8.000: -0.11808866262435913 | Iter 4.000: -0.14761251211166382 | Iter 2.000: -0.5671246349811554
Resources requested: 2.0/8 CPUs, 0/0 GPUs, 0.0/3.97 GiB heap, 0.0/1.99 GiB objects (0.0/2.0 CPU_group_0_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_095b3f18f9ccb0d439ba4db6cd0d258e, 0.0/2.0 CPU_group_072cf17df9fdf3ff5298636eeb895304, 0.0

In [5]:
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

Best trial config: {'lr': 0.003973031165244418, 'batch_size': 32, 'p_d1': 0.28100305152407873, 'p_d2': 0.2205255724755377, 'h1': 1024, 'h2': 32}
Best trial final validation loss: 0.07955656945705414
Best trial final validation accuracy: 0.9778333333333333


In [6]:
best_trained_model = FFNN(784, 10, 
                          p_d1=best_trial.config['p_d1'], 
                          p_d2=best_trial.config['p_d2'], h1=best_trial.config['h'])

best_checkpoint_dir = best_trial.checkpoint.value
model_state, optimiser_state = torch.load(osp.join(best_checkpoint_dir, "checkpoint"))
best_trained_model.load_state_dict(model_state)
test_acc = test_accuracy(best_trained_model)
print("Best trial test set accuracy: {}".format(test_acc))

RuntimeError: Error(s) in loading state_dict for FFNN:
	size mismatch for nn.0.weight: copying a param with shape torch.Size([1024, 784]) from checkpoint, the shape in current model is torch.Size([64, 784]).
	size mismatch for nn.0.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for nn.3.weight: copying a param with shape torch.Size([32, 1024]) from checkpoint, the shape in current model is torch.Size([32, 64]).