## 1. Importing necessary packages

In [1]:
# Basic python packages
import numpy as np
import pandas as pd
import os, math, time, pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
from operator import itemgetter
from IPython.display import clear_output

In [2]:
# Pytorch packages
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
from sklearn.model_selection import train_test_split
from torchvision import models, datasets, transforms
# Select the compute device. torch.device('cuda') can be constructed even when
# no CUDA runtime is present and only fails later, on the first tensor/module
# moved to it, so fall back to the CPU explicitly when CUDA is unavailable.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Displayed as the cell output: number of CUDA devices visible to this process.
torch.cuda.device_count()

  from .autonotebook import tqdm as notebook_tqdm


6

## 2. Training, validation, and test datasets

In [3]:
# Build the CIFAR-100 train / validation datasets and their preprocessing pipelines.

# Per-channel (mean, std) pair fed to Normalize; the original author describes
# these as the CIFAR100 training set normalization constants.
norm_stats = ((0.5071, 0.4866, 0.4409),(0.2009, 0.1984, 0.2023))
# Size argument shared by RandomResizedCrop (train) and Resize (valid).
R = 384

# Training pipeline: random augmentation first, then tensor conversion + normalization.
train_transform = transforms.Compose([
    transforms.AutoAugment(transforms.autoaugment.AutoAugmentPolicy.CIFAR10),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(size=R),
    transforms.ToTensor(),  # scales pixel values into [0, 1]
    transforms.Normalize(mean=norm_stats[0], std=norm_stats[1]),
])

# Validation pipeline: deterministic resize only, same tensor conversion + normalization.
valid_transform = transforms.Compose([
    transforms.Resize(size=R),
    transforms.ToTensor(),  # scales pixel values into [0, 1]
    transforms.Normalize(mean=norm_stats[0], std=norm_stats[1]),
])

ROOT = '/data/cifar100'

# train=True selects the CIFAR-100 training split.
train_dataset = datasets.CIFAR100(ROOT, train=True, download=True, transform=train_transform)

# train=False selects the other split; hold this data out for final evaluation.
valid_dataset = datasets.CIFAR100(ROOT, train=False, download=True, transform=valid_transform)

print(f'Train: {len(train_dataset):,.0f}, Valid: {len(valid_dataset):,.0f}')

Files already downloaded and verified
Files already downloaded and verified
Train: 50,000, Valid: 10,000


## 3. Run the training and evaluation routine

In [4]:
# DDP training routine inputs
model_type = 'effnetv2'
world_size = 6
time_budget_mins = np.inf # minutes per trial
nepochs = 250
batch_size = 25
accumulate = 4
evaluate = True
saving = 'final'

# Import or define model parameters
model_parameters = {"output_size": 100}

%run -i bte_ddp.py

with open(f'experiment_results/{model_type}_result.pkl', 'rb') as handle:
    results = pickle.load(handle)
    
# Unpack results object
avg_epoch_train_time = [results[e]["time"] for e in results]
train_loss_epoch = np.array([results[e]["tloss"] for e in results])
valid_loss_epoch = np.array([results[e]["vloss"] for e in results])
valid_top1accu_epoch = np.array([results[e]["top1"] for e in results])
valid_top5accu_epoch = np.array([results[e]["top5"] for e in results])

target = valid_top1accu_epoch.max()

# Save trial result measures to disk for inspection later.
payload = {
    'R': R,
    'time_budget_mins': time_budget_mins,
    'params': model_parameters,
    'tloss':train_loss_epoch,
    'vloss':valid_loss_epoch,
    'top1': valid_top1accu_epoch,
    'top5': valid_top5accu_epoch,
    'time': avg_epoch_train_time
}

with open(f'experiment_results/EffNetV2_results.pkl', 'wb') as handle:
    pickle.dump(payload, handle)

# Need to delete this because otherwise failed training is skipped and the file from last run is picked up instead.
os.remove(f'experiment_results/{model_type}_result.pkl')

Generated effnetv2 model with 20,305,588 params.
EPOCH 0, TLOSS 4.449, VLOSS 4.120, TOP1 6.14, TOP5 22.60, TIME 92.494
EPOCH 1, TLOSS 4.246, VLOSS 3.934, TOP1 8.61, TOP5 29.02, TIME 90.677
EPOCH 2, TLOSS 4.112, VLOSS 3.751, TOP1 10.91, TOP5 33.70, TIME 90.634
EPOCH 3, TLOSS 3.910, VLOSS 3.533, TOP1 15.30, TOP5 39.85, TIME 90.822
EPOCH 4, TLOSS 3.686, VLOSS 3.106, TOP1 21.77, TOP5 51.74, TIME 90.593
EPOCH 5, TLOSS 3.481, VLOSS 2.948, TOP1 25.07, TOP5 56.38, TIME 90.990
EPOCH 6, TLOSS 3.275, VLOSS 2.651, TOP1 31.27, TOP5 63.75, TIME 91.089
EPOCH 7, TLOSS 3.104, VLOSS 2.351, TOP1 37.53, TOP5 70.56, TIME 90.890
EPOCH 8, TLOSS 2.958, VLOSS 2.165, TOP1 41.18, TOP5 74.27, TIME 91.117
EPOCH 9, TLOSS 2.850, VLOSS 2.090, TOP1 43.11, TOP5 76.18, TIME 90.968
EPOCH 10, TLOSS 2.721, VLOSS 1.864, TOP1 48.87, TOP5 79.90, TIME 91.210
EPOCH 11, TLOSS 2.631, VLOSS 1.817, TOP1 49.70, TOP5 80.81, TIME 91.301
EPOCH 12, TLOSS 2.531, VLOSS 1.668, TOP1 52.84, TOP5 83.57, TIME 91.139
EPOCH 13, TLOSS 2.467, VLOS