In [3]:
!pip install ray[tune]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ray[tune]
  Downloading ray-2.0.0-cp37-cp37m-manylinux2014_x86_64.whl (59.4 MB)
[K     |████████████████████████████████| 59.4 MB 1.1 MB/s 
Collecting grpcio<=1.43.0,>=1.28.1
  Downloading grpcio-1.43.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 42.2 MB/s 
Collecting virtualenv
  Downloading virtualenv-20.16.5-py3-none-any.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 48.8 MB/s 
Collecting tensorboardX>=1.9
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 75.7 MB/s 
Collecting distlib<1,>=0.3.5
  Downloading distlib-0.3.6-py2.py3-none-any.whl (468 kB)
[K     |████████████████████████████████| 468 kB 65.8 MB/s 
[?25hCollecting platformdirs<3,>=2.4
  Downloading platformdirs-2.5.2-py3-none-any.whl (14 kB)
Installing collecte

In [54]:
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from filelock import FileLock
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
import ray
from ray import tune
from ray.air import session
from ray.air.checkpoint import Checkpoint
from ray.tune.schedulers import ASHAScheduler


In [55]:
def load_data(data_dir="./data"):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    with FileLock(os.path.expanduser("~/.data.lock")):
        trainset = torchvision.datasets.CIFAR10(
            root=data_dir, train=True, download=True, transform=transform)

        testset = torchvision.datasets.CIFAR10(
            root=data_dir, train=False, download=True, transform=transform)

    return trainset, testset

In [56]:
class Net(nn.Module):
    def __init__(self, l1=120, l2=84):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, l1)
        self.fc2 = nn.Linear(l1, l2)
        self.fc3 = nn.Linear(l2, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [57]:
def train_cifar(config):
    net = Net(config["l1"], config["l2"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    # To restore a checkpoint, use `session.get_checkpoint()`.
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
           model_state, optimizer_state = torch.load(os.path.join(loaded_checkpoint_dir, "checkpoint.pt"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    data_dir = os.path.abspath("./data")
    trainset, testset = load_data(data_dir)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs])

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)

    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and can be accessed through `session.get_checkpoint()`
        # API in future iterations.
        os.makedirs("my_model", exist_ok=True)
        torch.save(
            (net.state_dict(), optimizer.state_dict()), "my_model/checkpoint.pt")
        checkpoint = Checkpoint.from_directory("my_model")
        session.report({"loss": (val_loss / val_steps), "accuracy": correct / total}, checkpoint=checkpoint)
    print("Finished Training")

In [58]:
def test_best_model(best_result):
    best_trained_model = Net(best_result.config["l1"], best_result.config["l2"])
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    best_trained_model.to(device)

    checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")

    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    trainset, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = best_trained_model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()


    print("Best trial test set accuracy: {}".format(correct / total))

In [59]:
config = {
    "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
    "l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([2, 4, 8, 16]),
}

In [72]:
num_samples=10
max_num_epochs=10
gpus_per_trial=1

config = {
    "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
    "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([2, 4, 8, 16])
}
scheduler = ASHAScheduler(
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2
    )
reporter = CLIReporter(
    # parameter_columns=["l1", "l2", "lr", "batch_size"],
    metric_columns=["loss", "accuracy", "training_iteration"])
   
tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train_cifar),
        resources={"cpu": 2, "gpu": gpus_per_trial}
    ),
    tune_config=tune.TuneConfig(
        metric="loss",
        mode="min",
        scheduler=scheduler,
        num_samples=num_samples,
            
    ),
    param_space=config
)
results = tuner.fit()
   
best_result = results.get_best_result("loss", "min")

print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(
    best_result.metrics["loss"]))
print("Best trial final validation accuracy: {}".format(
    best_result.metrics["accuracy"]))

test_best_model(best_result)

Trial name,status,loc,batch_size,l1,l2,lr,iter,total time (s),loss,accuracy
train_cifar_13a1d_00000,TERMINATED,172.28.0.2:5760,16,16,32,0.000365922,10,218.574,1.33077,0.5217
train_cifar_13a1d_00001,TERMINATED,172.28.0.2:5760,8,256,16,0.0356145,1,37.0142,2.31333,0.1015
train_cifar_13a1d_00002,TERMINATED,172.28.0.2:5760,8,32,16,0.000423862,10,324.411,1.17589,0.587
train_cifar_13a1d_00003,TERMINATED,172.28.0.2:5760,8,256,128,0.00159949,10,323.686,1.19103,0.6181
train_cifar_13a1d_00004,TERMINATED,172.28.0.2:5760,16,64,32,0.0236986,2,49.2114,1.78205,0.3483
train_cifar_13a1d_00005,TERMINATED,172.28.0.2:5760,4,64,16,0.00117412,10,563.124,1.19332,0.5996
train_cifar_13a1d_00006,TERMINATED,172.28.0.2:5760,2,256,4,0.00361404,1,108.695,2.30432,0.1009
train_cifar_13a1d_00007,TERMINATED,172.28.0.2:5760,2,16,4,0.0492786,1,105.475,2.38906,0.0962
train_cifar_13a1d_00008,TERMINATED,172.28.0.2:5760,4,64,64,0.00099349,10,558.11,1.21816,0.5929
train_cifar_13a1d_00009,TERMINATED,172.28.0.2:5760,2,8,64,0.00172191,2,201.886,1.65788,0.3894


[2m[36m(train_cifar pid=5760)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00000_0_batch_size=16,l1=16,l2=32,lr=0.0004_2022-09-30_02-03-29/data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 229376/170498071 [00:00<01:33, 1815755.63it/s]
  1%|▏         | 2228224/170498071 [00:00<00:14, 11424426.46it/s]
  5%|▌         | 8814592/170498071 [00:00<00:04, 34781918.27it/s]
 10%|█         | 17334272/170498071 [00:00<00:02, 53685297.21it/s]
 14%|█▍        | 24510464/170498071 [00:00<00:02, 59923274.32it/s]
 19%|█▉        | 32931840/170498071 [00:00<00:02, 68023836.88it/s]
 23%|██▎       | 39976960/170498071 [00:00<00:01, 68631208.60it/s]
 29%|██▊       | 48726016/170498071 [00:00<00:01, 74490900.81it/s]
 33%|███▎      | 56229888/170498071 [00:00<00:01, 74232443.15it/s]
 38%|███▊      | 65110016/170498071 [00:01<00:01, 78621517.22it/s]
 43%|████▎     | 74055680/170498071 [00:01<00:01, 81880617.77it/s]
 49%|████▊     | 82739200/170498071 [00:01<00:01, 83357740.44it/s]
 54%|█████▎    | 91488256/170498071 [00:01<00:00, 84207084.21it/s]
 59%|█████▉    | 100368384/170498071 [00:01<00:00, 85533489.72it/s]
 64%|██████▍   | 109

[2m[36m(train_cifar pid=5760)[0m Extracting /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00000_0_batch_size=16,l1=16,l2=32,lr=0.0004_2022-09-30_02-03-29/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00000_0_batch_size=16,l1=16,l2=32,lr=0.0004_2022-09-30_02-03-29/data
[2m[36m(train_cifar pid=5760)[0m Files already downloaded and verified


[2m[36m(train_cifar pid=5760)[0m   cpuset_checked))


[2m[36m(train_cifar pid=5760)[0m [1,  2000] loss: 2.302
Result for train_cifar_13a1d_00000:
  accuracy: 0.1562
  date: 2022-09-30_02-04-03
  done: false
  experiment_id: 73c5e13b7be242449a3cb20da1539a18
  hostname: a758025fc9b4
  iterations_since_restore: 1
  loss: 2.2803065170288086
  node_ip: 172.28.0.2
  pid: 5760
  should_checkpoint: true
  time_since_restore: 32.23836088180542
  time_this_iter_s: 32.23836088180542
  time_total_s: 32.23836088180542
  timestamp: 1664503443
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 13a1d_00000
  warmup_time: 0.013912439346313477
  
[2m[36m(train_cifar pid=5760)[0m [2,  2000] loss: 2.156
Result for train_cifar_13a1d_00000:
  accuracy: 0.2962
  date: 2022-09-30_02-04-23
  done: false
  experiment_id: 73c5e13b7be242449a3cb20da1539a18
  hostname: a758025fc9b4
  iterations_since_restore: 2
  loss: 1.9187195610046386
  node_ip: 172.28.0.2
  pid: 5760
  should_checkpoint: true
  time_since_restore: 52.551509618759155
  time_this

[2m[36m(train_cifar pid=5760)[0m E0930 02:04:50.899957565    6176 chttp2_transport.cc:1103]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"


[2m[36m(train_cifar pid=5760)[0m [4,  2000] loss: 1.658
Result for train_cifar_13a1d_00000:
  accuracy: 0.428
  date: 2022-09-30_02-05-04
  done: false
  experiment_id: 73c5e13b7be242449a3cb20da1539a18
  hostname: a758025fc9b4
  iterations_since_restore: 4
  loss: 1.5814978337287904
  node_ip: 172.28.0.2
  pid: 5760
  should_checkpoint: true
  time_since_restore: 93.21935963630676
  time_this_iter_s: 20.464045524597168
  time_total_s: 93.21935963630676
  timestamp: 1664503504
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: 13a1d_00000
  warmup_time: 0.013912439346313477
  
[2m[36m(train_cifar pid=5760)[0m [5,  2000] loss: 1.550
Result for train_cifar_13a1d_00000:
  accuracy: 0.4518
  date: 2022-09-30_02-05-25
  done: false
  experiment_id: 73c5e13b7be242449a3cb20da1539a18
  hostname: a758025fc9b4
  iterations_since_restore: 5
  loss: 1.503974890613556
  node_ip: 172.28.0.2
  pid: 5760
  should_checkpoint: true
  time_since_restore: 114.40159392356873
  time_this_

[2m[36m(train_cifar pid=5760)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 229376/170498071 [00:00<01:35, 1781668.85it/s]
  1%|          | 2097152/170498071 [00:00<00:15, 10667892.21it/s]
  6%|▌         | 9961472/170498071 [00:00<00:04, 39822908.59it/s]
 11%|█         | 18415616/170498071 [00:00<00:02, 56768304.83it/s]
 16%|█▌        | 27131904/170498071 [00:00<00:02, 67342898.99it/s]
 21%|██        | 35782656/170498071 [00:00<00:01, 73740572.00it/s]
 26%|██▌       | 44662784/170498071 [00:00<00:01, 78545897.89it/s]
 31%|███▏      | 53444608/170498071 [00:00<00:01, 81433171.50it/s]
 36%|███▋      | 62095360/170498071 [00:00<00:01, 82991092.18it/s]
 41%|████▏     | 70647808/170498071 [00:01<00:01, 83737403.70it/s]
 46%|████▋     | 79069184/170498071 [00:01<00:01, 82945215.02it/s]
 52%|█████▏    | 87982080/170498071 [00:01<00:00, 84786930.22it/s]
 57%|█████▋    | 97026048/170498071 [00:01<00:00, 86380129.77it/s]
 62%|██████▏   | 105676800/170498071 [00:01<00:00, 

[2m[36m(train_cifar pid=5760)[0m Extracting /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00001_1_batch_size=8,l1=256,l2=16,lr=0.0356_2022-09-30_02-07-09/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00001_1_batch_size=8,l1=256,l2=16,lr=0.0356_2022-09-30_02-07-09/data
[2m[36m(train_cifar pid=5760)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=5760)[0m [1,  2000] loss: 2.280
[2m[36m(train_cifar pid=5760)[0m [1,  4000] loss: 1.156
Result for train_cifar_13a1d_00001:
  accuracy: 0.1015
  date: 2022-09-30_02-07-46
  done: true
  experiment_id: 73c5e13b7be242449a3cb20da1539a18
  hostname: a758025fc9b4
  iterations_since_restore: 1
  loss: 2.3133270238876342
  node_ip: 172.28.0.2
  pid: 5760
  should_checkpoint: true
  time_since_restore: 37.01415419578552
  time_this_iter_s: 37.01415419578552
  time_total_s: 37.01415419578552
  timestamp: 1664503666
  timesteps_since_restore: 0
  traini

[2m[36m(train_cifar pid=5760)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 131072/170498071 [00:00<02:17, 1242700.56it/s]
  1%|          | 950272/170498071 [00:00<00:32, 5191501.57it/s]
  2%|▏         | 3407872/170498071 [00:00<00:12, 13882561.95it/s]
  4%|▍         | 6455296/170498071 [00:00<00:08, 20295665.31it/s]
  5%|▍         | 8519680/170498071 [00:00<00:08, 19717499.02it/s]
  7%|▋         | 11370496/170498071 [00:00<00:07, 22529348.18it/s]
  8%|▊         | 13664256/170498071 [00:00<00:07, 21507691.43it/s]
  9%|▉         | 16023552/170498071 [00:00<00:06, 22133395.26it/s]
 11%|█         | 18808832/170498071 [00:00<00:06, 23804807.77it/s]
 13%|█▎        | 21823488/170498071 [00:01<00:05, 25691460.97it/s]
 15%|█▍        | 25264128/170498071 [00:01<00:05, 28291880.80it/s]
 16%|█▋        | 28114944/170498071 [00:01<00:05, 27983010.81it/s]
 18%|█▊        | 31031296/170498071 [00:01<00:04, 28186044.95it/s]
 20%|█▉        | 33914880/170498071 [00:01<00:04, 28272

[2m[36m(train_cifar pid=5760)[0m Extracting /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00002_2_batch_size=8,l1=32,l2=16,lr=0.0004_2022-09-30_02-07-46/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00002_2_batch_size=8,l1=32,l2=16,lr=0.0004_2022-09-30_02-07-46/data
[2m[36m(train_cifar pid=5760)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=5760)[0m [1,  2000] loss: 2.300
[2m[36m(train_cifar pid=5760)[0m [1,  4000] loss: 1.118
Result for train_cifar_13a1d_00002:
  accuracy: 0.2662
  date: 2022-09-30_02-08-28
  done: false
  experiment_id: 73c5e13b7be242449a3cb20da1539a18
  hostname: a758025fc9b4
  iterations_since_restore: 1
  loss: 2.057453628063202
  node_ip: 172.28.0.2
  pid: 5760
  should_checkpoint: true
  time_since_restore: 41.82619619369507
  time_this_iter_s: 41.82619619369507
  time_total_s: 41.82619619369507
  timestamp: 1664503708
  timesteps_since_restore: 0
  training

[2m[36m(train_cifar pid=5760)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 229376/170498071 [00:00<01:35, 1779036.46it/s]
  1%|          | 2097152/170498071 [00:00<00:15, 10648540.18it/s]
  6%|▌         | 9863168/170498071 [00:00<00:04, 39337064.52it/s]
 11%|█         | 19005440/170498071 [00:00<00:02, 59086611.40it/s]
 16%|█▌        | 27459584/170498071 [00:00<00:02, 68012684.28it/s]
 21%|██▏       | 36372480/170498071 [00:00<00:01, 75007029.68it/s]
 27%|██▋       | 45252608/170498071 [00:00<00:01, 79400905.03it/s]
 32%|███▏      | 54231040/170498071 [00:00<00:01, 82613215.41it/s]
 37%|███▋      | 63012864/170498071 [00:00<00:01, 84228383.21it/s]
 42%|████▏     | 71499776/170498071 [00:01<00:01, 83108160.93it/s]
 47%|████▋     | 80510976/170498071 [00:01<00:01, 85216668.24it/s]
 52%|█████▏    | 89391104/170498071 [00:01<00:00, 86225343.19it/s]
 58%|█████▊    | 98041856/170498071 [00:01<00:00, 86029973.70it/s]
 63%|██████▎   | 106659840/170498071 [00:01<00:00, 

[2m[36m(train_cifar pid=5760)[0m Extracting /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00003_3_batch_size=8,l1=256,l2=128,lr=0.0016_2022-09-30_02-13-11/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00003_3_batch_size=8,l1=256,l2=128,lr=0.0016_2022-09-30_02-13-11/data
[2m[36m(train_cifar pid=5760)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=5760)[0m [1,  2000] loss: 2.005
[2m[36m(train_cifar pid=5760)[0m [1,  4000] loss: 0.816
Result for train_cifar_13a1d_00003:
  accuracy: 0.4696
  date: 2022-09-30_02-13-50
  done: false
  experiment_id: 73c5e13b7be242449a3cb20da1539a18
  hostname: a758025fc9b4
  iterations_since_restore: 1
  loss: 1.4614509905338287
  node_ip: 172.28.0.2
  pid: 5760
  should_checkpoint: true
  time_since_restore: 39.64850664138794
  time_this_iter_s: 39.64850664138794
  time_total_s: 39.64850664138794
  timestamp: 1664504030
  timesteps_since_restore: 0
  tra

[2m[36m(train_cifar pid=5760)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 229376/170498071 [00:00<01:35, 1777734.68it/s]
  1%|          | 1966080/170498071 [00:00<00:16, 9967923.64it/s]
  5%|▌         | 9306112/170498071 [00:00<00:04, 37120987.73it/s]
 10%|▉         | 16646144/170498071 [00:00<00:03, 50874745.17it/s]
 19%|█▉        | 32309248/170498071 [00:00<00:02, 65771551.63it/s]
 24%|██▍       | 40632320/170498071 [00:00<00:01, 71278793.87it/s]
 29%|██▉       | 49381376/170498071 [00:00<00:01, 75911034.20it/s]
 33%|███▎      | 57049088/170498071 [00:00<00:01, 72463298.27it/s]
 38%|███▊      | 64389120/170498071 [00:01<00:01, 72524532.00it/s]
 42%|████▏     | 71696384/170498071 [00:01<00:01, 68619906.85it/s]
 51%|█████     | 87326720/170498071 [00:01<00:01, 72896151.31it/s]
 56%|█████▌    | 94666752/170498071 [00:01<00:01, 69615228.27it/s]
 60%|██████    | 102858752/170498071 [00:01<00:00, 73015668.48it/s]
 65%|██████▍   | 110231552/170498071 [00:01<00:00, 

[2m[36m(train_cifar pid=5760)[0m Extracting /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00004_4_batch_size=16,l1=64,l2=32,lr=0.0237_2022-09-30_02-18-34/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00004_4_batch_size=16,l1=64,l2=32,lr=0.0237_2022-09-30_02-18-34/data
[2m[36m(train_cifar pid=5760)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=5760)[0m [1,  2000] loss: 1.921
Result for train_cifar_13a1d_00004:
  accuracy: 0.3109
  date: 2022-09-30_02-19-04
  done: false
  experiment_id: 73c5e13b7be242449a3cb20da1539a18
  hostname: a758025fc9b4
  iterations_since_restore: 1
  loss: 1.8886317911148072
  node_ip: 172.28.0.2
  pid: 5760
  should_checkpoint: true
  time_since_restore: 29.277180910110474
  time_this_iter_s: 29.277180910110474
  time_total_s: 29.277180910110474
  timestamp: 1664504344
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 13a1d_00004
  warmup_time: 

[2m[36m(train_cifar pid=5760)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 131072/170498071 [00:00<02:11, 1299975.68it/s]
  1%|          | 917504/170498071 [00:00<00:33, 5076721.42it/s]
  4%|▍         | 6914048/170498071 [00:00<00:05, 29736226.42it/s]
  8%|▊         | 13762560/170498071 [00:00<00:03, 44821450.59it/s]
 13%|█▎        | 22249472/170498071 [00:00<00:02, 58879554.49it/s]
 17%|█▋        | 29622272/170498071 [00:00<00:02, 63848915.16it/s]
 22%|██▏       | 37027840/170498071 [00:00<00:01, 67121454.84it/s]
 27%|██▋       | 45219840/170498071 [00:00<00:01, 71747704.97it/s]
 31%|███       | 52428800/170498071 [00:00<00:01, 69981544.60it/s]
 36%|███▌      | 60915712/170498071 [00:01<00:01, 74433657.42it/s]
 40%|████      | 68386816/170498071 [00:01<00:01, 73774169.47it/s]
 45%|████▍     | 76054528/170498071 [00:01<00:01, 74595925.15it/s]
 49%|████▉     | 84082688/170498071 [00:01<00:01, 76258650.90it/s]
 54%|█████▍    | 91717632/170498071 [00:01<00:01, 726

[2m[36m(train_cifar pid=5760)[0m Extracting /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00005_5_batch_size=4,l1=64,l2=16,lr=0.0012_2022-09-30_02-19-24/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00005_5_batch_size=4,l1=64,l2=16,lr=0.0012_2022-09-30_02-19-24/data
[2m[36m(train_cifar pid=5760)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=5760)[0m [1,  2000] loss: 2.299
[2m[36m(train_cifar pid=5760)[0m [1,  4000] loss: 1.014
[2m[36m(train_cifar pid=5760)[0m [1,  6000] loss: 0.598
[2m[36m(train_cifar pid=5760)[0m [1,  8000] loss: 0.417
[2m[36m(train_cifar pid=5760)[0m [1, 10000] loss: 0.318
Result for train_cifar_13a1d_00005:
  accuracy: 0.4416
  date: 2022-09-30_02-20-23
  done: false
  experiment_id: 73c5e13b7be242449a3cb20da1539a18
  hostname: a758025fc9b4
  iterations_since_restore: 1
  loss: 1.5380751318335533
  node_ip: 172.28.0.2
  pid: 5760
  should_checkpoint: tr

[2m[36m(train_cifar pid=5760)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 131072/170498071 [00:00<02:15, 1258160.65it/s]
  1%|          | 1048576/170498071 [00:00<00:29, 5828123.23it/s]
  2%|▏         | 4259840/170498071 [00:00<00:09, 17680906.84it/s]
  5%|▍         | 8388608/170498071 [00:00<00:06, 26776784.75it/s]
  6%|▋         | 11075584/170498071 [00:00<00:06, 26517139.60it/s]
  9%|▉         | 14974976/170498071 [00:00<00:05, 30676092.38it/s]
 11%|█         | 18055168/170498071 [00:00<00:05, 29495977.62it/s]
 13%|█▎        | 21987328/170498071 [00:00<00:04, 32461474.56it/s]
 15%|█▍        | 25264128/170498071 [00:00<00:04, 32118274.84it/s]
 17%|█▋        | 28901376/170498071 [00:01<00:04, 33304018.75it/s]
 19%|█▉        | 32243712/170498071 [00:01<00:04, 33141563.61it/s]
 21%|██▏       | 36503552/170498071 [00:01<00:03, 35959275.90it/s]
 24%|██▎       | 40402944/170498071 [00:01<00:03, 36775834.62it/s]
 26%|██▌       | 44728320/170498071 [00:01<00:03, 385

[2m[36m(train_cifar pid=5760)[0m Extracting /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00006_6_batch_size=2,l1=256,l2=4,lr=0.0036_2022-09-30_02-28-47/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00006_6_batch_size=2,l1=256,l2=4,lr=0.0036_2022-09-30_02-28-47/data
[2m[36m(train_cifar pid=5760)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=5760)[0m [1,  2000] loss: 2.307
[2m[36m(train_cifar pid=5760)[0m [1,  4000] loss: 1.154
[2m[36m(train_cifar pid=5760)[0m [1,  6000] loss: 0.769
[2m[36m(train_cifar pid=5760)[0m [1,  8000] loss: 0.577
[2m[36m(train_cifar pid=5760)[0m [1, 10000] loss: 0.461
[2m[36m(train_cifar pid=5760)[0m [1, 12000] loss: 0.384
[2m[36m(train_cifar pid=5760)[0m [1, 14000] loss: 0.329
[2m[36m(train_cifar pid=5760)[0m [1, 16000] loss: 0.288
[2m[36m(train_cifar pid=5760)[0m [1, 18000] loss: 0.256
[2m[36m(train_cifar pid=5760)[0m [1, 20000] los

[2m[36m(train_cifar pid=5760)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 196608/170498071 [00:00<01:26, 1964091.70it/s]
  1%|          | 1245184/170498071 [00:00<00:24, 6972143.34it/s]
  4%|▍         | 6815744/170498071 [00:00<00:05, 29197399.00it/s]
  9%|▉         | 15794176/170498071 [00:00<00:02, 53051909.08it/s]
 14%|█▍        | 24543232/170498071 [00:00<00:02, 65417394.22it/s]
 20%|█▉        | 33456128/170498071 [00:00<00:01, 73437006.84it/s]
 25%|██▍       | 42172416/170498071 [00:00<00:01, 77841832.96it/s]
 30%|██▉       | 50855936/170498071 [00:00<00:01, 80551675.82it/s]
 35%|███▍      | 59277312/170498071 [00:00<00:01, 81681950.70it/s]
 40%|███▉      | 68124672/170498071 [00:01<00:01, 83700345.44it/s]
 45%|████▍     | 76677120/170498071 [00:01<00:01, 83936867.67it/s]
 50%|█████     | 85786624/170498071 [00:01<00:00, 86018359.15it/s]
 55%|█████▌    | 94404608/170498071 [00:01<00:00, 84041790.45it/s]
 61%|██████    | 103284736/170498071 [00:01<00:00, 8

[2m[36m(train_cifar pid=5760)[0m Extracting /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00007_7_batch_size=2,l1=16,l2=4,lr=0.0493_2022-09-30_02-30-36/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00007_7_batch_size=2,l1=16,l2=4,lr=0.0493_2022-09-30_02-30-36/data
[2m[36m(train_cifar pid=5760)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=5760)[0m [1,  2000] loss: 2.359
[2m[36m(train_cifar pid=5760)[0m [1,  4000] loss: 1.180
[2m[36m(train_cifar pid=5760)[0m [1,  6000] loss: 0.784
[2m[36m(train_cifar pid=5760)[0m [1,  8000] loss: 0.591
[2m[36m(train_cifar pid=5760)[0m [1, 10000] loss: 0.472
[2m[36m(train_cifar pid=5760)[0m [1, 12000] loss: 0.394
[2m[36m(train_cifar pid=5760)[0m [1, 14000] loss: 0.337
[2m[36m(train_cifar pid=5760)[0m [1, 16000] loss: 0.295
[2m[36m(train_cifar pid=5760)[0m [1, 18000] loss: 0.262
[2m[36m(train_cifar pid=5760)[0m [1, 20000] loss:

[2m[36m(train_cifar pid=5760)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 131072/170498071 [00:00<02:11, 1291816.62it/s]
  1%|          | 950272/170498071 [00:00<00:32, 5293564.72it/s]
  1%|▏         | 2326528/170498071 [00:00<00:18, 9065103.82it/s]
  2%|▏         | 3932160/170498071 [00:00<00:14, 11684257.94it/s]
  3%|▎         | 5668864/170498071 [00:00<00:12, 13594148.22it/s]
  5%|▍         | 7733248/170498071 [00:00<00:10, 15897841.80it/s]
  6%|▌         | 10059776/170498071 [00:00<00:08, 18248466.38it/s]
  7%|▋         | 12550144/170498071 [00:00<00:07, 20277506.45it/s]
  9%|▉         | 15106048/170498071 [00:00<00:07, 21867705.38it/s]
 10%|█         | 17301504/170498071 [00:01<00:07, 21870135.84it/s]
 11%|█▏        | 19595264/170498071 [00:01<00:06, 22071928.04it/s]
 13%|█▎        | 21823488/170498071 [00:01<00:06, 22091199.31it/s]
 14%|█▍        | 24117248/170498071 [00:01<00:06, 22314261.48it/s]
 15%|█▌        | 26411008/170498071 [00:01<00:06, 2246226

[2m[36m(train_cifar pid=5760)[0m Extracting /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00008_8_batch_size=4,l1=64,l2=64,lr=0.0010_2022-09-30_02-32-21/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00008_8_batch_size=4,l1=64,l2=64,lr=0.0010_2022-09-30_02-32-21/data
[2m[36m(train_cifar pid=5760)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=5760)[0m [1,  2000] loss: 2.206
[2m[36m(train_cifar pid=5760)[0m [1,  4000] loss: 0.915
[2m[36m(train_cifar pid=5760)[0m [1,  6000] loss: 0.549
[2m[36m(train_cifar pid=5760)[0m [1,  8000] loss: 0.389
[2m[36m(train_cifar pid=5760)[0m [1, 10000] loss: 0.300
Result for train_cifar_13a1d_00008:
  accuracy: 0.4526
  date: 2022-09-30_02-33-25
  done: false
  experiment_id: 73c5e13b7be242449a3cb20da1539a18
  hostname: a758025fc9b4
  iterations_since_restore: 1
  loss: 1.4963509201049805
  node_ip: 172.28.0.2
  pid: 5760
  should_checkpoint: tr

  0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 229376/170498071 [00:00<01:34, 1800575.46it/s]
  1%|▏         | 2228224/170498071 [00:00<00:14, 11312331.21it/s]
  6%|▌         | 10387456/170498071 [00:00<00:03, 41037591.71it/s]
 12%|█▏        | 19628032/170498071 [00:00<00:02, 60387998.49it/s]
 17%|█▋        | 28377088/170498071 [00:00<00:02, 69744411.11it/s]
 22%|██▏       | 37552128/170498071 [00:00<00:01, 77029248.90it/s]
 27%|██▋       | 46333952/170498071 [00:00<00:01, 80172364.94it/s]
 32%|███▏      | 55312384/170498071 [00:00<00:01, 83086181.79it/s]
 38%|███▊      | 64061440/170498071 [00:00<00:01, 83361122.58it/s]
 43%|████▎     | 73367552/170498071 [00:01<00:01, 86207313.08it/s]
 48%|████▊     | 82116608/170498071 [00:01<00:01, 86563845.69it/s]
 53%|█████▎    | 90800128/170498071 [00:01<00:00, 85315398.79it/s]
 58%|█████▊    | 99647488/170498071 [00:01<00:00, 86183257.47it/s]
 64%|██████▎   | 108593152/170498071 [00:01<00:00, 87152976.09it/s]
 69%|██████▉   | 11

[2m[36m(train_cifar pid=5760)[0m Extracting /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00009_9_batch_size=2,l1=8,l2=64,lr=0.0017_2022-09-30_02-41-39/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2022-09-30_02-03-28/train_cifar_13a1d_00009_9_batch_size=2,l1=8,l2=64,lr=0.0017_2022-09-30_02-41-39/data
[2m[36m(train_cifar pid=5760)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=5760)[0m [1,  2000] loss: 2.201
[2m[36m(train_cifar pid=5760)[0m [1,  4000] loss: 0.966
[2m[36m(train_cifar pid=5760)[0m [1,  6000] loss: 0.616
[2m[36m(train_cifar pid=5760)[0m [1,  8000] loss: 0.449
[2m[36m(train_cifar pid=5760)[0m [1, 10000] loss: 0.353
[2m[36m(train_cifar pid=5760)[0m [1, 12000] loss: 0.286
[2m[36m(train_cifar pid=5760)[0m [1, 14000] loss: 0.246
[2m[36m(train_cifar pid=5760)[0m [1, 16000] loss: 0.214
[2m[36m(train_cifar pid=5760)[0m [1, 18000] loss: 0.188
[2m[36m(train_cifar pid=5760)[0m [1, 20000] loss:

2022-09-30 02:45:01,703	INFO tune.py:759 -- Total run time: 2492.85 seconds (2492.70 seconds for the tuning loop).


Best trial config: {'l1': 32, 'l2': 16, 'lr': 0.00042386219608474783, 'batch_size': 8}
Best trial final validation loss: 1.175886014330387
Best trial final validation accuracy: 0.587
Files already downloaded and verified
Files already downloaded and verified
Best trial test set accuracy: 0.5855
