# Second iteration

Let's treat PBT as an advisor to whatever training loop one is using.

In [2]:
import abc
import typing
import numpy as np

class PbtAdvisor(abc.ABC):
    """Interface for objects that advise a training loop on which workers to copy.

    An advisor only looks at performance metrics and answers with copy
    actions; carrying those actions out is left to the training loop.
    """

    @abc.abstractmethod
    def advise(
        self, *, performance: np.ndarray, **kwargs
    ) -> typing.Dict[typing.Tuple[int, ...], typing.Tuple[int, ...]]:
        """Return the copy actions suggested by the given performance metrics.

        Args:
            performance: Array of performance scores, one entry per worker.
            **kwargs: Extra keyword arguments an implementation may accept.

        Returns:
            A mapping from destination index to source index. Each index is
            a tuple with one entry per axis of ``performance`` (the
            implementations in this notebook return such tuples), so a flat
            population is addressed as ``(i,)``.
        """
    

The idea is to have an advisor that takes in performance metrics and returns a map from destination to source that specifies which models and hyperparameters to copy. The open question is then how to decide what to copy, and when.

In [3]:
class DeviationPbtAdvisor(PbtAdvisor):
    """Advise replacing workers that fall too far below the population mean.

    A worker counts as an underperformer when its score is lower than
    ``mean - max_lower_deviation * stddev`` of all scores, where ``stddev``
    is the sample standard deviation (``ddof=1``). Every underperformer is
    advised to copy the single best performer (highest score).
    """

    def __init__(self, max_lower_deviation=1):
        """Store the threshold.

        Args:
            max_lower_deviation: How many standard deviations below the mean
                a score may lie before its worker is flagged.
        """
        self.max_lower_deviation = max_lower_deviation

    def advise(self, *, performance: np.ndarray, **kwargs) -> typing.Dict:
        """Map every underperformer's index to the best performer's index.

        Args:
            performance: Array-like of scores; higher is better.
            **kwargs: Ignored.

        Returns:
            Dict whose keys and values are index tuples (one ``int`` per
            axis of ``performance``). Empty when nobody is flagged or when
            fewer than two scores are given.
        """
        performance = np.array(performance)

        # With fewer than two scores the sample standard deviation is
        # undefined (and argmax fails on empty input), so there is nothing
        # to advise.
        if performance.size < 2:
            return {}

        stddev = np.std(performance, ddof=1)
        mean = np.mean(performance)

        # If we call this too often in a row without training in-between,
        # it will just copy the best performer everywhere.
        underperformers = performance < mean - self.max_lower_deviation * stddev
        indices = np.transpose(np.nonzero(underperformers))

        # The shape is passed positionally: the old ``dims=`` keyword was
        # deprecated and is rejected by newer NumPy releases.
        best_performer = tuple(
            int(axis_index)
            for axis_index in np.unravel_index(np.argmax(performance),
                                               performance.shape))

        # Plain ints keep the keys readable when printed and usable as
        # ordinary list indices.
        return {
            tuple(int(axis_index) for axis_index in index): best_performer
            for index in indices
        }

In [4]:
def test_deviation_pbt_advisor():
    """Check DeviationPbtAdvisor on a small, hand-verifiable population."""
    advisor = DeviationPbtAdvisor()
    advice = advisor.advise(performance=[1,0,5,6,7,7])
    print(advice)
    # mean ~ 4.33 and sample stddev ~ 3.08 give a threshold of ~ 1.26, so
    # only the scores 1 and 0 (indices 0 and 1) are flagged; the first
    # maximum (7) sits at index 4.
    assert advice == {(0,): (4,), (1,): (4,)}

test_deviation_pbt_advisor()

{(0,): (4,), (1,): (4,)}


In [5]:
def test_tuple_as_indices():
    """Show that the single entry of a 1-tuple can be used to index a list."""
    values = [1, 2, 3]
    index = (1,)
    # The looked-up element is discarded; the point is that this does not raise.
    values[index[0]]

test_tuple_as_indices()

In [6]:
class AboutEveryNAdvisor(PbtAdvisor):
    """Wrap another advisor so that it is only consulted every few calls.

    ``next_n`` is called to obtain a countdown. Each call to ``advise``
    decrements it; once it reaches zero or below, a fresh countdown is drawn
    and the call is forwarded to the wrapped advisor. All other calls return
    an empty dict.
    """

    def __init__(self, inner_advisor: PbtAdvisor, next_n: typing.Callable):
        """Remember the wrapped advisor and draw the first countdown.

        Args:
            inner_advisor: Advisor that produces the actual advice.
            next_n: Zero-argument callable returning the next countdown.
        """
        self.inner_advisor = inner_advisor
        self.next_n = next_n
        self.steps_left = next_n()

    def advise(self, *, performance: np.ndarray, **kwargs) -> typing.Dict:
        """Forward to the inner advisor when the countdown expires, else ``{}``."""
        self.steps_left -= 1
        if self.steps_left > 0:
            # Countdown still running: no advice this time.
            return {}

        # Re-arm the countdown before delegating.
        self.steps_left = self.next_n()
        return self.inner_advisor.advise(performance=performance, **kwargs)

## Critique

* There needs to be a separation between performance and the underperformer predicate.
  With the current API, the only way to implement varying steps_left is to filter the inner_advisor's advice dict every step (and it will produce actions every step for sure)
  
Another way to think of what we are doing here is to run lots of experiments in parallel and for each experiment we try to establish if it is the most promising.

In [7]:
# From pytorch examples
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

In [8]:
import tqdm
import numpy as np

In [9]:
# Training settings, declared as command-line options. Inside the notebook
# the parser is fed an empty argument list, so every option takes its default.
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')

# Batch sizes and run length.
parser.add_argument(
    '--batch-size',
    type=int,
    default=64,
    metavar='N',
    help='input batch size for training (default: 64)')
parser.add_argument(
    '--test-batch-size',
    type=int,
    default=1000,
    metavar='N',
    help='input batch size for testing (default: 1000)')
parser.add_argument(
    '--epochs',
    type=int,
    default=10,
    metavar='N',
    help='number of epochs to train (default: 10)')

# Optimizer hyperparameters.
parser.add_argument(
    '--lr',
    type=float,
    default=0.01,
    metavar='LR',
    help='learning rate (default: 0.01)')
parser.add_argument(
    '--momentum',
    type=float,
    default=0.5,
    metavar='M',
    help='SGD momentum (default: 0.5)')

# Device, seeding and logging.
parser.add_argument(
    '--no-cuda',
    action='store_true',
    default=False,
    help='disables CUDA training')
parser.add_argument(
    '--seed',
    type=int,
    default=1,
    metavar='S',
    help='random seed (default: 1)')
parser.add_argument(
    '--log-interval',
    type=int,
    default=10,
    metavar='N',
    help='how many batches to wait before logging training status')

args = parser.parse_args(args=[])

# CUDA is used only when it was not disabled and a device is available.
args.cuda = torch.cuda.is_available() if not args.no_cuda else False

In [10]:
import copy

In [11]:
# Seed the CPU generator, and the CUDA generator too when training on GPU.
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Extra DataLoader options are only passed along when CUDA is in use.
if args.cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    kwargs = {}

# Both splits share the same preprocessing: convert to a tensor, then
# normalise with fixed constants.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307, ), (0.3081, )),
])

# Training split; downloaded into '../data' if necessary.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        '../data', train=True, download=True, transform=mnist_transform),
    batch_size=args.batch_size,
    shuffle=True,
    **kwargs)

# Held-out split, read from the same directory.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=mnist_transform),
    batch_size=args.test_batch_size,
    shuffle=True,
    **kwargs)


class Net(nn.Module):
    """Small convolutional classifier: two conv layers, then two linear layers.

    ``forward`` flattens the convolutional features to 320 values per sample
    and returns log-probabilities over 10 classes.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Args:
            x: Batch of single-channel images whose convolutional features
                flatten to 320 values per sample (true for 28x28 inputs).

        Returns:
            Tensor of shape ``(batch, 10)`` holding log-probabilities.
        """
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        # Functional dropout must be told explicitly whether we are training.
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # Normalise over the class axis explicitly; relying on the implicit
        # dimension is deprecated and emits a warning.
        return F.log_softmax(x, dim=1)


class Worker(typing.NamedTuple):
    """One member of the population: a network plus the optimizer training it."""
    model: Net
    optimizer: optim.Optimizer

    @staticmethod
    def create():
        """Build a fresh worker using the learning rate and momentum from ``args``.

        The model is moved to the GPU when ``args.cuda`` is set.
        """
        model = Net()
        if args.cuda:
            model.cuda()
        optimizer = optim.SGD(
            model.parameters(), lr=args.lr, momentum=args.momentum)
        return Worker(model, optimizer)

    def perturb_learning_rate(self):
        """Randomly perturb this worker's learning rate and momentum in place.

        The learning rate is multiplied by ``10 ** N(0, 1)`` and the momentum
        is shifted by ``N(0, 0.1)``. The same draw is applied to every
        parameter group.
        """
        new_lr_factor = 10**np.random.normal(scale=1.0)
        new_momentum_delta = np.random.normal(scale=0.1)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] *= new_lr_factor
            # Never let the momentum go negative: optim.SGD itself rejects
            # a negative momentum at construction time.
            param_group['momentum'] = max(
                0.0, param_group['momentum'] + new_momentum_delta)

    def clone(self):
        """Return a new, independent worker with this worker's weights and optimizer state."""
        new_worker = self.create()
        new_worker.model.load_state_dict(self.model.state_dict())
        # Deep-copy the optimizer state so that the clone never shares state
        # tensors (e.g. momentum buffers) with the source, whether or not
        # load_state_dict makes its own copy.
        new_worker.optimizer.load_state_dict(
            copy.deepcopy(self.optimizer.state_dict()))
        return new_worker

In [12]:
def train(model, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network to train; switched to training mode.
        optimizer: Optimizer that updates ``model``'s parameters.
        epoch: Epoch number, used only in the progress description.
    """
    model.train()
    progress_iterable = tqdm.tqdm_notebook(train_loader)
    for batch_idx, (data, target) in enumerate(progress_iterable):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            # loss is a 0-dim tensor; .item() extracts the Python float
            # (indexing it with [0] fails on PyTorch >= 0.4).
            progress_iterable.set_description('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test(model):
    """Evaluate ``model`` on ``test_loader`` and print a summary.

    Args:
        model: Network to evaluate; switched to evaluation mode.

    Returns:
        The average negative log-likelihood loss per sample as a float.
    """
    model.eval()
    test_loss = 0
    correct = 0
    progress_iterable = tqdm.tqdm_notebook(test_loader)
    # Disable gradient tracking during evaluation; the old ``volatile=True``
    # flag no longer has any effect.
    with torch.no_grad():
        for data, target in progress_iterable:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            # .item() keeps ``correct`` a plain int so it formats cleanly below.
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    return test_loss

# Advisor consulted once per epoch to decide which workers get replaced.
pbt_advisor = DeviationPbtAdvisor()

# Build the population and perturb each worker's learning rate and momentum
# once up front, so the workers start with different hyperparameters.
num_workers = 5
workers = [Worker.create() for _ in range(num_workers)]
for worker in workers:
    worker.perturb_learning_rate()
    
for epoch in range(1, args.epochs + 1):
    # Train every worker for one epoch and record its score.
    performances = []
    for idx, worker in enumerate(workers):
        print('Worker %s' % idx)
        train(worker.model, worker.optimizer, epoch)
        # test() returns the average loss; negate it so that a higher
        # score means a better worker, as the advisor expects.
        performance = -test(worker.model)
        performances.append(performance)
        
    # Replace each flagged worker with a perturbed clone of the chosen source.
    copy_actions = pbt_advisor.advise(performance=performances)
    for dst, src in copy_actions.items():
        # Indices arrive as 1-tuples because the scores form a flat list.
        workers[dst[0]] = workers[src[0]].clone()
        workers[dst[0]].perturb_learning_rate()
        print('Copying %s to %s' % (src, dst))

Worker 0









Test set: Average loss: 0.3163, Accuracy: 9122/10000 (91%)

Worker 1







Test set: Average loss: 2.0927, Accuracy: 4940/10000 (49%)

Worker 2







Test set: Average loss: 0.1665, Accuracy: 9482/10000 (95%)

Worker 3







Test set: Average loss: 2.2966, Accuracy: 1735/10000 (17%)

Worker 4







Test set: Average loss: 0.1673, Accuracy: 9500/10000 (95%)

Copying (2,) to (3,)
Worker 0







Test set: Average loss: 0.1893, Accuracy: 9415/10000 (94%)

Worker 1







Test set: Average loss: 0.9599, Accuracy: 7995/10000 (80%)

Worker 2







Test set: Average loss: 0.1074, Accuracy: 9680/10000 (97%)

Worker 3







Test set: Average loss: 0.1579, Accuracy: 9507/10000 (95%)

Worker 4







Test set: Average loss: 0.1040, Accuracy: 9669/10000 (97%)

Copying (4,) to (1,)
Worker 0







Test set: Average loss: 0.1436, Accuracy: 9550/10000 (96%)

Worker 1







Test set: Average loss: 0.0923, Accuracy: 9715/10000 (97%)

Worker 2







Test set: Average loss: 0.0871, Accuracy: 9726/10000 (97%)

Worker 3







Test set: Average loss: 0.1556, Accuracy: 9513/10000 (95%)

Worker 4







Test set: Average loss: 0.0851, Accuracy: 9732/10000 (97%)

Copying (4,) to (3,)
Worker 0







Test set: Average loss: 0.1200, Accuracy: 9638/10000 (96%)

Worker 1







Test set: Average loss: 0.0860, Accuracy: 9723/10000 (97%)

Worker 2







Test set: Average loss: 0.0745, Accuracy: 9753/10000 (98%)

Worker 3







Test set: Average loss: 0.0784, Accuracy: 9754/10000 (98%)

Worker 4







Test set: Average loss: 0.0761, Accuracy: 9770/10000 (98%)

Copying (2,) to (0,)
Worker 0







Test set: Average loss: 0.0721, Accuracy: 9763/10000 (98%)

Worker 1







Test set: Average loss: 0.0822, Accuracy: 9739/10000 (97%)

Worker 2







Test set: Average loss: 0.0688, Accuracy: 9778/10000 (98%)

Worker 3







Test set: Average loss: 0.0619, Accuracy: 9794/10000 (98%)

Worker 4







Test set: Average loss: 0.0663, Accuracy: 9798/10000 (98%)

Copying (3,) to (1,)
Worker 0







Test set: Average loss: 0.0713, Accuracy: 9769/10000 (98%)

Worker 1







Test set: Average loss: 0.0854, Accuracy: 9740/10000 (97%)

Worker 2







Test set: Average loss: 0.0620, Accuracy: 9786/10000 (98%)

Worker 3







Test set: Average loss: 0.0587, Accuracy: 9823/10000 (98%)

Worker 4







Test set: Average loss: 0.0633, Accuracy: 9799/10000 (98%)

Copying (3,) to (1,)
Worker 0







Test set: Average loss: 0.0708, Accuracy: 9771/10000 (98%)

Worker 1







Test set: Average loss: 0.0690, Accuracy: 9784/10000 (98%)

Worker 2







Test set: Average loss: 0.0592, Accuracy: 9802/10000 (98%)

Worker 3







Test set: Average loss: 0.0515, Accuracy: 9843/10000 (98%)

Worker 4







Test set: Average loss: 0.0613, Accuracy: 9817/10000 (98%)

Copying (3,) to (0,)
Worker 0







Test set: Average loss: 0.0498, Accuracy: 9852/10000 (99%)

Worker 1







Test set: Average loss: 0.0645, Accuracy: 9803/10000 (98%)

Worker 2







Test set: Average loss: 0.0571, Accuracy: 9815/10000 (98%)

Worker 3







Test set: Average loss: 0.0500, Accuracy: 9842/10000 (98%)

Worker 4







Test set: Average loss: 0.0558, Accuracy: 9826/10000 (98%)

Copying (0,) to (1,)
Worker 0







Test set: Average loss: 0.0471, Accuracy: 9855/10000 (99%)

Worker 1







Test set: Average loss: 0.0483, Accuracy: 9850/10000 (98%)

Worker 2







Test set: Average loss: 0.0494, Accuracy: 9834/10000 (98%)

Worker 3







Test set: Average loss: 0.0498, Accuracy: 9847/10000 (98%)

Worker 4







Test set: Average loss: 0.0531, Accuracy: 9837/10000 (98%)

Copying (0,) to (4,)
Worker 0







Test set: Average loss: 0.0474, Accuracy: 9854/10000 (99%)

Worker 1







Test set: Average loss: 0.0481, Accuracy: 9854/10000 (99%)

Worker 2







Test set: Average loss: 0.0487, Accuracy: 9827/10000 (98%)

Worker 3







Test set: Average loss: 0.0443, Accuracy: 9861/10000 (99%)

Worker 4







Test set: Average loss: 0.0472, Accuracy: 9853/10000 (99%)



## MNIST code critique

I'm using the test set as the validation set: copy decisions are based on test loss, so the population slowly overfits to the test set.