## 1. Importing necessary packages

In [1]:
# Basic python packages
import numpy as np
import pandas as pd
import os, math, time, pickle
from tqdm import tqdm
from operator import itemgetter
from IPython.display import clear_output

In [2]:
# Computer vision packages
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
from sklearn.model_selection import train_test_split
from torchvision import models, datasets, transforms
from components import BetaNet, BetaNet3
# Handle for the CUDA device, followed by a sanity print of how many GPUs PyTorch can see.
# NOTE(review): `device` is not referenced again in the cells visible here; presumably
# bte_ddp.py picks it up from the notebook namespace via `%run -i` — confirm.
device = torch.device('cuda')
print(torch.cuda.device_count())

  from .autonotebook import tqdm as notebook_tqdm


6


In [None]:
# Ax experimentation and Bayesian Optimisation packages
import ax
from ax.service.ax_client import AxClient
from ax.modelbridge.generation_strategy import GenerationStrategy, GenerationStep
from ax.modelbridge.registry import Models, ModelRegistryBase

## 2. Mean and standard deviation of training data

In [4]:
# ROOT = '/data/cifar100'
# train_dataset = datasets.CIFAR100(root=ROOT, train=True, transform=None, download=True)
# tfm = transforms.ToTensor()

# means = [tfm(x_y[0]).mean(axis=[1,2]) for x_y in train_dataset]
# means = sum(means) / len(train_dataset)

# stds = [tfm(x_y[0]).std(axis=[1,2]) for x_y in train_dataset]
# stds = sum(stds) / len(train_dataset)

# norm_stats = (means, stds)
# norm_stats = ((0.5071, 0.4866, 0.4409),(0.2009, 0.1984, 0.2023))

## 3. Global training parameters

In [3]:
# Global params
# bte_ddp.py is executed with `%run -i` inside objective_function, so it reads several of
# these names (world_size, model_type, time_budget_mins, nepochs, batch_size, accumulate,
# evaluate, saving) directly from the notebook namespace. Renaming them here breaks training.
world_size = 6 # Number of GPUs to train on
model_type = 'beta2' # or 'beta3'; refer to the "model_builder" function in bte_ddp.py for options
R = 224 # Size passed to RandomResizedCrop / Resize below, and handed to the model as "R"
output_size = 100 # Handed to the model as "output_size"
time_budget_mins = 5 # minutes per trial
nepochs = 300 # More than we'll need in 5 minutes
batch_size = 100 # For world size of 6, effective batch size 600
accumulate = 1 # Accumulate batch gradients to increase effective batch size if necessary
evaluate = True # Whether to evaluate on valid_dataset after each epoch
saving = 'none' # Saving model after 'best' evaluation on valid dataset, after 'final' epoch or 'none'.

## 4. Training, validation, and test datasets

In [5]:
# Build the train / validation / test datasets.
ROOT = '/data/cifar100'

# CIFAR100 training-set normalization constants: (per-channel means, per-channel stds)
norm_stats = ((0.5071, 0.4866, 0.4409),(0.2009, 0.1984, 0.2023))


def _cifar100(train, transform):
    """Open the requested CIFAR100 split under ROOT with the given transform (downloading if absent)."""
    return datasets.CIFAR100(root=ROOT, train=train, transform=transform, download=True)


# Training pipeline: augmentation first, then tensor conversion and normalization.
train_transform = transforms.Compose([
    transforms.AutoAugment(policy = transforms.autoaugment.AutoAugmentPolicy.CIFAR10),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(R),
    transforms.ToTensor(),
    transforms.Normalize(*norm_stats),
])

# Evaluation pipeline: deterministic resize only, same normalization.
valid_transform = transforms.Compose([
    transforms.Resize(R),
    transforms.ToTensor(),
    transforms.Normalize(*norm_stats),
])

# The "training" split is opened twice so that the train and validation subsets can carry
# different transforms while indexing the same underlying images.
full_train_augmented = _cifar100(train=True, transform=train_transform)
full_train_plain = _cifar100(train=True, transform=valid_transform)

# Hold-out test dataset
test_dataset = _cifar100(train=False, transform=valid_transform)

# 80/20 split of the training indices; the fixed seed keeps the split reproducible.
train_idx, val_idx = train_test_split(
    range(len(full_train_augmented)),
    test_size=0.2,
    random_state=308184653,
)
train_dataset = torch.utils.data.Subset(full_train_augmented, train_idx)
valid_dataset = torch.utils.data.Subset(full_train_plain, val_idx)

print(f'Train: {len(train_dataset):,.0f}, Valid: {len(valid_dataset):,.0f}, Test: {len(test_dataset):,.0f}')

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Train: 40,000, Valid: 10,000, Test: 10,000


## 5. Define Ax search space, constraints, and objective function

In [6]:
import ax
from ax.service.ax_client import AxClient

def _float_range(name, low, high):
    """Ax parameter spec for a float searched over the closed interval [low, high]."""
    return {"name": name, "type": "range", "bounds": [low, high], "value_type": "float"}


def _float_fixed(name, value):
    """Ax parameter spec for a float held constant at `value` for every trial."""
    return {"name": name, "type": "fixed", "value": value, "value_type": "float"}


# Search space handed to AxClient.create_experiment: one spec dict per model hyper-parameter.
parameter_space = [
    _float_range("k", 2, 4),         # log2(max remaining spatial extent at final conv layers)
    _float_range("K", 3, 5),         # Model depth scaler
    _float_range("Da", -0.5, 0.5),   # Governs distribution of spatial contractions
    _float_range("Db", -0.5, 0.5),   # Governs distribution of spatial contractions
    _float_range("Wa", -0.2, 0.5),   # Governs n_channels as function of spatial resolution
    _float_range("Wb", -0.5, 0.5),   # Governs n_channels as function of spatial resolution
    _float_range("G0", 2, 4),        # Output channels multiple of R
    _float_range("G1", 1, 3),        # Latent vector size multiple of output channels
    _float_range("B", 0, 1),         # Depth at which transition from Fused-MBConv to MBConv
    _float_range("E0", 2, 6),        # Starting MBConv expansion ratio
    _float_range("E1", 2, 6),        # Ending MBConv expansion ratio
    _float_range("S0", 10, 30),      # Starting Squeeze-excitation bottleneck ratio
    _float_range("S1", 10, 30),      # Ending Squeeze-excitation bottleneck ratio
    _float_fixed("SD0", 0.0),        # Stochastic depth starting percent
    _float_fixed("SD1", 0.2),        # Stochastic depth ending percent
    _float_fixed("dp", 0.2),         # Classifier dropout percent
]

# constraints = [
#     "SD0 <= SD1"
# ]

# Define trial evaluation function
def objective_function(
    trial_index,
    world_size,
    model_type,
    model_parameters,
    time_budget_mins,
    nepochs,
    batch_size,
    accumulate,
    train_dataset,
    valid_dataset,
    evaluate,
    saving,
    dest_dir
):
    '''
    Train one candidate model with bte_ddp.py and return its best validation top-1 accuracy.

    Python script bte_ddp.py accesses notebook name space with global parameters
     - model_type           # type of model to build
     - model_parameters     # parameter dictionary specific to model type
     - world_size           # number of GPUs to train on
     - time_budget_mins     # training time budget
     - nepochs              # max epochs of training
     - batch_size           # the number of examples included in each forward pass. Typically set to max possible on a GPU
     - accumulate           # the number of forward passes to accumulate in each update step
     - train_dataset        # training dataset
     - valid_dataset        # validation dataset
     - evaluate             # boolean indicating whether or not to evaluate the model each epoch on a validation set.
     - saving               # save model after each epoch the model improves on the validation set if 'best' or at the end of N epochs if 'final'.

    Because the script is run with `-i` it reads those names from the notebook namespace,
    not from this function's arguments. The arguments therefore have to be the notebook-level
    variables of the same name; only trial_index, model_type, model_parameters,
    time_budget_mins and dest_dir are used directly in this function body.

    bte_ddp.py writes its per-epoch results to "experiment_results/{model_type}_result.pkl";
    that directory and dest_dir are created here if missing.

    Args:
        trial_index: Ax trial index, used to name the saved trial file.
        model_parameters: dict of model hyper-parameters for this trial.
        dest_dir: directory receiving "trial_{trial_index}.pkl".
        (remaining arguments: see the list above)

    Returns:
        dict: {"target": <max validation top-1 accuracy over epochs, as float>}.

    Raises:
        FileNotFoundError: the script produced no results file (e.g. training failed).
        ValueError: the results file contained no epochs.
    '''
    results_dir = 'experiment_results'
    result_path = f'{results_dir}/{model_type}_result.pkl'

    os.makedirs(results_dir, exist_ok=True)
    os.makedirs(dest_dir, exist_ok=True)

    # A results file left over from an earlier trial must never be mistaken for this
    # trial's outcome, so clear it before training starts.
    if os.path.exists(result_path):
        os.remove(result_path)

    # Equivalent of the line magic `%run -i bte_ddp.py`, written as plain Python.
    get_ipython().run_line_magic('run', '-i bte_ddp.py')

    # The file is our own training output. Do not point this at untrusted pickles.
    try:
        with open(result_path, 'rb') as handle:
            results = pickle.load(handle)
    finally:
        # Consume the file immediately so a failure further down cannot leave it behind.
        if os.path.exists(result_path):
            os.remove(result_path)

    # results object saved by bte_ddp.py takes the following structure.
    # results[epoch] = {
    #         "tloss": tloss_,
    #         "vloss": vloss_,
    #         "top1": top1_,
    #         "top5": top5_,
    #         "time": epoch_duration_
    #     }

    # Unpack results object into one array per measure, ordered as the epochs are stored.
    avg_epoch_train_time = np.array([results[e]["time"] for e in results])
    train_loss_epoch = np.array([results[e]["tloss"] for e in results])
    valid_loss_epoch = np.array([results[e]["vloss"] for e in results])
    valid_top1accu_epoch = np.array([results[e]["top1"] for e in results])
    valid_top5accu_epoch = np.array([results[e]["top5"] for e in results])

    # Plain float so the value is accepted wherever a Python number is required.
    target = float(valid_top1accu_epoch.max())

    # Prefer the resolution recorded with this trial; fall back to the notebook-level R.
    resolution = model_parameters['R'] if 'R' in model_parameters else R

    # Save trial result measures for inspection or later reuse.
    payload = {
        'R': resolution,
        'time_budget_mins': time_budget_mins,
        'params': model_parameters,
        'tloss':train_loss_epoch,
        'vloss':valid_loss_epoch,
        'top1': valid_top1accu_epoch,
        'top5': valid_top5accu_epoch,
        'time': avg_epoch_train_time
    }

    with open(f'{dest_dir}/trial_{trial_index}.pkl', 'wb') as handle:
        pickle.dump(payload, handle)

    return {
        "target": target,
    }

## 6. Define Ax sampling strategy and initialize experiment

In [None]:
from ax.modelbridge.generation_strategy import GenerationStrategy, GenerationStep
from ax.modelbridge.registry import Models, ModelRegistryBase

# Phase 1: Sobol sampling for the first num_trials trials.
sobol_step = GenerationStep(
    model=Models.SOBOL,
    num_trials=100,
    min_trials_observed=100,
    max_parallelism=1,
    enforce_num_trials=False,
)

# Phase 2: Bayesian optimization using GPEI, with no limit on the number of trials.
gpei_step = GenerationStep(
    model=Models.GPEI,
    num_trials=-1,
    max_parallelism=1,
)

gen_strat = GenerationStrategy(steps=[sobol_step, gpei_step])

# Destination directory for candidate model evaluations and experiment results to be saved.
dest_dir = 'nas_trial_results'

# Initialize the Ax client
ax_client = AxClient(
    enforce_sequential_optimization=True,
    generation_strategy=gen_strat,
)

# Create an experiment that maximizes the single metric "target".
ax_client.create_experiment(
    name="nas_client",
    parameters=parameter_space,
    # parameter_constraints = constraints,
    objective_name="target",
    minimize=False,
)

## 6a. Run NAS

In [None]:
# Search budget: stop once this many trials have actually been trained and scored.
TARGET_NONTRIVIAL_TRIALS = 200
# Candidates with at least this many trainable parameters are scored 0 without training.
MAX_TRAINABLE_PARAMS = 30_000_000

# Every save below writes into dest_dir, so make sure it exists before the first trial.
os.makedirs(dest_dir, exist_ok=True)


def _save_report(report_text, out_dir):
    """Pickle the running text report to '{out_dir}/report.pkl'."""
    with open(f'{out_dir}/report.pkl', 'wb') as handle:
        pickle.dump(report_text, handle)


def _save_experiment_state(client, out_dir):
    """Save the trials dataframe and current best parameters; return (df, best_parameters, values)."""
    trials_df = client.get_trials_data_frame()
    trials_df.to_pickle(f'{out_dir}/results_df.pkl')

    best, best_values = client.get_best_parameters()
    with open(f'{out_dir}/best_parameters.pkl', 'wb') as handle:
        pickle.dump(best, handle)
    return trials_df, best, best_values


def _show_report(report_text):
    """Replace the cell output with the latest report."""
    clear_output(wait=True)
    print(report_text)


# Initialize the report and count of non-trivial results obtained 'n'
report = ''
n = 0

# Continue until TARGET_NONTRIVIAL_TRIALS non-trivial results have been obtained.
# The loop must stay at notebook top level: bte_ddp.py is run with `%run -i` and reads
# model_parameters (and the other training settings) from the notebook namespace.
while n < TARGET_NONTRIVIAL_TRIALS:

    # Get NAS trial parameters
    model_parameters, trial_index = ax_client.get_next_trial()
    model_parameters["R"] = R
    model_parameters["output_size"] = output_size

    report += f'TRIAL {trial_index}\n'
    _show_report(report)

    # Build model on CPU first to test model size
    # NOTE(review): BetaNet is built here regardless of model_type; confirm this is the
    # right class for the size check when model_type is not 'beta2'.
    try_model = BetaNet(**model_parameters)
    params = sum(p.numel() for p in try_model.parameters() if p.requires_grad)

    if params < MAX_TRAINABLE_PARAMS: # Passes big model params test

        try: # Proceed with experiment

            # Trains candidate model for training time budget minutes.
            evaluation = objective_function(
                trial_index, world_size, model_type, model_parameters, time_budget_mins, nepochs,
                batch_size, accumulate, train_dataset, valid_dataset, evaluate, saving, dest_dir
            )

            # Log successful evaluation with the ax_client.
            ax_client.complete_trial(trial_index=trial_index, raw_data=evaluation)

            # Save partial experiment results and current best parameters from the GP model
            result_df, best_parameters, values = _save_experiment_state(ax_client, dest_dir)

            # Increment number of non-trivial results obtained
            n += 1

            # Update the report
            report += f'Saved {n} non-trivial trials. This result {evaluation}\n'
            report += f'{model_parameters}\n'
            _save_report(report, dest_dir)
            _show_report(report)

        # Only ordinary errors count as a failed trial (out-of-memory or otherwise).
        # KeyboardInterrupt / SystemExit are not subclasses of Exception, so interrupting
        # the kernel stops the search instead of being logged as a failure.
        except Exception as exc:

            # Log the experiment failure with ax_client - specific parameters are not sampled again.
            # Possible alternative could be to return evaluation of zero.
            ax_client.log_trial_failure(trial_index=trial_index)

            # Update the report, keeping the cause so failures can be diagnosed afterwards.
            report += f'Failed during experiment run. ({type(exc).__name__}: {exc})\n'
            _save_report(report, dest_dir)
            _show_report(report)

    else: # Model failed number of parameters test

        # Trivial evaluation of zero
        evaluation = {'target': 0}
        ax_client.complete_trial(trial_index=trial_index, raw_data=evaluation)

        result_df, best_parameters, values = _save_experiment_state(ax_client, dest_dir)

        # Placeholder trial record with zeroed measures, same keys as a trained trial's record.
        payload = {
            'R': R,
            'time_budget_mins': time_budget_mins,
            'params': model_parameters,
            'tloss': np.array([0]),
            'vloss': np.array([0]),
            'top1': np.array([0]),
            'top5': np.array([0]),
            'time': np.array([0]),
        }

        with open(f'{dest_dir}/trial_{trial_index}.pkl', 'wb') as handle:
            pickle.dump(payload, handle)

        # Update the report
        report += 'Failed due to large model scaling.\n'
        _save_report(report, dest_dir)
        _show_report(report)

TRIAL 0
Failed due to large model scaling.
TRIAL 1
Failed during experiment run.
TRIAL 2
Failed due to large model scaling.
TRIAL 3
Failed during experiment run.
TRIAL 4
Failed due to large model scaling.
TRIAL 5
Failed due to large model scaling.
TRIAL 6
Saved 1 non-trivial trials. This result {'target': (43.97120575884822, None)}
{'k': 3.0289208739995956, 'K': 3.387609176337719, 'Da': 0.0630921283736825, 'Db': -0.003648870624601841, 'Wa': 0.04659281810745594, 'Wb': -0.4933423949405551, 'G0': 2.1971279941499233, 'G1': 1.4532180596143007, 'B': 0.540430567227304, 'E0': 2.305983327329159, 'E1': 5.019226271659136, 'S0': 16.236738953739405, 'S1': 26.51225995272398, 'SD0': 0.0, 'SD1': 0.2, 'dp': 0.2, 'R': 224, 'output_size': 100}
TRIAL 7
Failed due to large model scaling.
TRIAL 8
Failed due to large model scaling.
TRIAL 9
Failed during experiment run.
TRIAL 10
Failed due to large model scaling.
TRIAL 11
Failed due to large model scaling.
TRIAL 12
Failed due to large model scaling.
TRIAL 13


Process Process-1292:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/workspaces/test_folder/bte_ddp.py", line 270, in init_process
    dist.init_process_group(backend, rank=rank, world_size=world_size)
  File "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 614, in init_process_group
    default_pg = _new_process_group_helper(
  File "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 750, in _new_process_group_helper
    pg = ProcessGroupNCCL(prefix_store, rank, world_size, pg_options)
RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!


## 6b. In the event of infra failure: inspect report above, identify trial at which failure occurred, delete all trials saved in dest_dir after that point.

In [None]:
# # Trial after which the failure occurred
# failure_occurred_after = 253

# saved_trials = [f for f in os.listdir('nas_trial_results') if f.startswith('trial_')]
# trial_indexes = {int(f.split('_')[1].split('.')[0]):f for f in saved_trials}
# del_trials = [f for i,f in trial_indexes.items() if i > failure_occurred_after]

# # Inspect and confirm the trials that will be deleted
# del_trials

In [None]:
# # Delete the trials to be deleted
# for f in del_trials:
#     os.remove(f'nas_trial_results/{f}')

In [24]:
# # Prepare valid trials for loading back into re-started experiment
# saved_trials = [f for f in os.listdir('nas_trial_results') if f.startswith('trial_')]
# saved_trials = sorted(saved_trials, key=lambda s: int(s.split('_')[1].split('.')[0]))

# # Inspect the trials to be recovered and reused.
# saved_trials

In [46]:
# # Sampling strategy to restart with - assumes that failure occurred after initial Sobol sampling was completed.
# # If not, may need to introduce some Sobol sampling steps to begin with, or potentially just restart the whole experiment.

# gen_strat = GenerationStrategy(
#     steps=[
#         # 1. Bayesian optimization using GPEI
#         GenerationStep(
#             model=Models.GPEI,
#             num_trials=-1,  # No limitation
#             max_parallelism=1,
#         ),
#     ]
# )

# # Destination directory for candidate model evaluations and experiment results to be saved.
# dest_dir = 'nas_trial_results'

# # Initialize the Ax client
# ax_client = AxClient(enforce_sequential_optimization=True, generation_strategy=gen_strat)

# # Create an experimeent
# ax_client.create_experiment(
#     name = "nas_client",
#     parameters = parameter_space,
#     # parameter_constraints = constraints,
#     objective_name = "target",
#     minimize = False,
# )

# # Re-start report and count of non-trivial results
# report = ''
# n = 0

# # Load recovered trials into re-started experiment
# for trial in saved_trials:
#     with open(f'nas_trial_results/{trial}', 'rb') as handle:
#         results = pickle.load(handle)
#     parameters = results['params']
#     del parameters['R']
#     del parameters['output_size']
#     parameters, trial_index = ax_client.attach_trial(parameters)
#     top1 = float(results['top1'].max())
#     evaluation = {'target': top1}
    
#     if top1 > 0:
#         n += 1
#         report += f'Loaded {n} non-trivial trials. This result: {top1}\n'
        
#     ax_client.complete_trial(trial_index=trial_index, raw_data=evaluation)

[INFO 11-01 01:41:44] ax.service.ax_client: Starting optimization with verbose logging. To disable logging, set the `verbose_logging` argument to `False`. Note that float values in the logs are rounded to 6 decimal points.
[INFO 11-01 01:41:45] ax.service.utils.instantiation: Created search space: SearchSpace(parameters=[RangeParameter(name='k', parameter_type=FLOAT, range=[2.0, 4.0]), RangeParameter(name='K', parameter_type=FLOAT, range=[3.0, 5.0]), RangeParameter(name='Da', parameter_type=FLOAT, range=[-0.5, 0.5]), RangeParameter(name='Db', parameter_type=FLOAT, range=[-0.5, 0.5]), RangeParameter(name='Wa', parameter_type=FLOAT, range=[-0.2, 0.5]), RangeParameter(name='Wb', parameter_type=FLOAT, range=[-0.5, 0.5]), RangeParameter(name='G0', parameter_type=FLOAT, range=[2.0, 4.0]), RangeParameter(name='G1', parameter_type=FLOAT, range=[1.0, 3.0]), RangeParameter(name='B', parameter_type=FLOAT, range=[0.0, 1.0]), RangeParameter(name='E0', parameter_type=FLOAT, range=[2.0, 6.0]), RangeP

In [None]:
# # Continue with experiment
# while n < 200:
    
#     # Get NAS trial parameters
#     model_parameters, trial_index = ax_client.get_next_trial()
#     model_parameters["R"] = R
#     model_parameters["output_size"] = output_size
    
#     report += f'TRIAL {trial_index}\n'
#     clear_output(wait=True)
#     print(report)
    
#     # Build model on CPU first to test model size
#     try_model = BetaNet(**model_parameters)
#     params = sum(p.numel() for p in try_model.parameters() if p.requires_grad)
    
#     if params < 30_000_000: # Passes big model params test

#         try: # Proceed with experiment

#             # Trains candidate model for training time budget minutes.
#             evaluation = objective_function(
#                 trial_index, world_size, model_type, model_parameters, time_budget_mins, nepochs, 
#                 batch_size, accumulate, train_dataset, valid_dataset, evaluate, saving, dest_dir
#             )
            
#             # Log successful evaluation with the ax_client.
#             ax_client.complete_trial(trial_index=trial_index, raw_data=evaluation)
            
#             # Obtain partial experiment results dataframe and save
#             result_df = ax_client.get_trials_data_frame()
#             result_df.to_pickle(f'{dest_dir}/results_df.pkl')
            
#             # Obtain current best parameters from GP model and save
#             best_parameters, values = ax_client.get_best_parameters()
#             with open(f'{dest_dir}/best_parameters.pkl', 'wb') as handle:
#                 pickle.dump(best_parameters, handle)
                
#             # Increment number of non-trivial results obtained
#             n += 1

#             # Update the report
#             report += f'Saved {n} non-trivial trials. This result {evaluation}\n'
#             report += f'{model_parameters}\n'
#             with open(f'{dest_dir}/report.pkl', 'wb') as handle:
#                 pickle.dump(report, handle)
                
#             clear_output(wait=True)
#             print(report)

#         except: # Training potentially failed due to out-of-memory error or otherwise.

#             # Log the experiment failure with ax_client - specific parameters are not sampled again.
#             # Possible alternative could be to return evaluation of zero.
#             ax_client.log_trial_failure(trial_index=trial_index)
            
#             # Update the report
#             report += 'Failed during experiment run.\n'
#             with open(f'{dest_dir}/report.pkl', 'wb') as handle:
#                 pickle.dump(report, handle)
                
#             clear_output(wait=True)
#             print(report)

#     else: # Model failed number of parameters test

#         # Trivial evaluation of zero
#         evaluation = {'target': 0}
#         ax_client.complete_trial(trial_index=trial_index, raw_data=evaluation)
        
#         result_df = ax_client.get_trials_data_frame()
#         result_df.to_pickle(f'{dest_dir}/results_df.pkl')
        
#         best_parameters, values = ax_client.get_best_parameters()
#         with open(f'{dest_dir}/best_parameters.pkl', 'wb') as handle:
#             pickle.dump(best_parameters, handle)

#         payload = {
#             'R': R,
#             'time_budget_mins': time_budget_mins,
#             'params': model_parameters,
#             'tloss': np.array([0]),
#             'vloss': np.array([0]),
#             'top1': np.array([0]),
#             'top5': np.array([0]),
#             'time': np.array([0]),
#         }

#         with open(f'{dest_dir}/trial_{trial_index}.pkl', 'wb') as handle:
#             pickle.dump(payload, handle)
            
#         # Update the report
#         report += 'Failed due to large model scaling.\n'
#         with open(f'{dest_dir}/report.pkl', 'wb') as handle:
#             pickle.dump(report, handle)
                
#         clear_output(wait=True)
#         print(report)

#         continue

TRIAL 223
Failed due to large model scaling.
TRIAL 224
Failed due to large model scaling.
TRIAL 225
Failed due to large model scaling.
TRIAL 226
Failed due to large model scaling.
TRIAL 227
Failed due to large model scaling.
TRIAL 228
Failed due to large model scaling.
TRIAL 229
Failed due to large model scaling.
TRIAL 230
Failed due to large model scaling.
TRIAL 231
Failed due to large model scaling.
TRIAL 232
Failed during experiment run.
TRIAL 233
Failed due to large model scaling.
TRIAL 234
Failed due to large model scaling.
TRIAL 235
Saved 141 non-trivial trials. This result {'target': (38.09238152369526, None)}
{'k': 2.6802521273493767, 'K': 3.296406101435423, 'Da': 0.40810731798410416, 'Db': -0.47286105435341597, 'Wa': 0.4807143990881741, 'Wb': 0.3482295451685786, 'G0': 2.9936380349099636, 'G1': 1.9052241034805775, 'B': 0.4388819048181176, 'E0': 3.3734178580343723, 'E1': 2.5992591865360737, 'S0': 28.326365016400814, 'S1': 17.81637167558074, 'SD0': 0.0, 'SD1': 0.2, 'dp': 0.2, 'R'