# experiment

> Notebook where the training experiments take place

In [None]:
#| default_exp experiment

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *
from fastcore.utils import *

In [None]:
#| hide
import os

# The COLAB_RELEASE_TAG environment variable is used here as the signal that
# the notebook is executing inside a Google Colab runtime.
if os.getenv("COLAB_RELEASE_TAG"):
   print("Running in Colab")       
   from google.colab import drive
   # Mount Google Drive so the project directory stored there is reachable.
   drive.mount('/content/drive')
   # IPython magics: switch into the project directory on Drive, then install
   # wandb into the current runtime.
   %cd /content/drive/MyDrive/GitHub/birdclef_2023
   %pip install wandb
else:
   print("NOT in Colab")

NOT in Colab


In [None]:
#| export
import wandb

from birdclef.trainer import train

These are the variables that must be set to start an experiment:

1. **project**: The name of the wandb project where the training, evaluation and test results will be logged and stored.

2. **entity**: The wandb entity associated with the project.

3. **sweep_name**: The name given to the sweep configuration, which defines the hyperparameter search setup for an experiment. It's used to organize and categorize different hyperparameter tuning runs.

4. **method**: The method or strategy used for hyperparameter tuning. With 'random', hyperparameters are sampled at random from the specified ranges or values for each run of the sweep.

5. **n_runs**: The number of runs or iterations that will be performed during the hyperparameter sweep. Each run involves training the model with a specific set of hyperparameters.

6. **run_name**: The name given to each individual run or iteration of the experiment. It helps identify and differentiate between different runs, providing a meaningful label for tracking and analysis.

7. **device**: The computational device (e.g., 'cpu', 'cuda') on which the training and evaluation of the model will be performed.

8. **train_key**: Key or identifier used to access the training dataset. Refer to `get_dataset` for info about available keys.

9. **val_key**: Key or identifier used to access the validation dataset. Refer to `get_dataset` for info about available keys.

10. **test_key**: Key or identifier used to access the test dataset. Refer to `get_dataset` for info about available keys.

11. **batch_size**: The number of samples in each mini-batch during training. It affects the efficiency of the training process and the model's ability to generalize.

12. **num_workers**: The number of worker processes used to load data in parallel during training and validation. It can help speed up the data loading process.

13. **pin_memory**: A boolean indicating whether to pin memory for faster data transfer to the GPU. This is often beneficial when using a GPU for training.

14. **model_key**: Key or identifier used to specify the model architecture to be used for training. Refer to `get_model` for info about available keys.

15. **optimizer_key**: Key or identifier used to specify the optimizer to be used during the training process. Refer to `get_optimizer` for info about available keys.

16. **learning_rate**: A list of candidate learning rates for the optimizer; the sweep picks one value from this list for each run. The learning rate is a crucial hyperparameter affecting the convergence and performance of the model.

17. **loss_key**: Key or identifier used to specify the loss function to be used during training. Refer to `get_loss_func` for info about available keys.

18. **metric**: The metric used to evaluate the model's performance. This metric is used to compare and choose the best model in a single run. Refer to `compute_metrics` for info about available metrics.

19. **epochs**: The number of epochs or complete passes through the training dataset during the training process. One epoch is a single pass through the entire training dataset.

20. **callback_step**: The callback function is called every *n* steps, where *n* is the value of *callback_step*.

21. **callback_key**: Key or identifier used to specify the callback function that is called every *callback_step* steps.

In [None]:
# --- wandb destination ------------------------------------------------------
entity = '4projects'                     # wandb entity that owns the project
project = 'bird-clef-lr-scheduler-test'  # wandb project receiving the logs

# --- sweep definition -------------------------------------------------------
# NOTE(review): sweep_name mentions 'on_plateau' while scheduler_key below is
# 'cosine' — confirm the label is intended.
sweep_name = 'test_lr_on_plateau'
method = 'random'  # hyperparameter search strategy
n_runs = 1         # number of runs the agent executes

# --- run identity and hardware ----------------------------------------------
run_name = 'test4'
device = 'cpu'  # e.g. 'cpu' or 'cuda'

# --- datasets and data loading ----------------------------------------------
train_key = 'train_simple'
val_key = 'val_simple'
test_key = 'test_simple'
batch_size = 16     # used for both the train and the val loader settings
num_workers = 2
pin_memory = False

# --- model, optimisation and evaluation -------------------------------------
model_key = 'efficient_net_v2_s'
optimizer_key = 'adamw'
learning_rate = [0.001]  # list of candidates; swept through 'values'
loss_key = 'ce'
metric = 'f1'
epochs = 1

# --- callback ---------------------------------------------------------------
callback_step = 1
callback_key = ''

# --- learning-rate scheduler ------------------------------------------------
# scheduler_key selects the scheduler; every other value in this group is
# bundled into 'lr_scheduler_kwargs'. Which of them a given scheduler actually
# consumes is decided by the training code, which is not visible here.
scheduler_key = 'cosine'
scheduler_metric = 'loss'
scheduler_step = 2
start_factor = 1
end_factor = 1e-6
scheduler_verbose = 1
scheduler_patience = 5
scheduler_eta_min = 1e-9

Create the experiment configuration as a dict.

In [None]:
def _fixed(value):
    """Wrap a single constant in the ``{'value': ...}`` form used by the sweep config."""
    return {'value': value}


def _fixed_group(**settings):
    """Build a nested ``{'parameters': {...}}`` group whose members are all constants.

    Keyword order is preserved, so the resulting dict keeps the order in which
    the settings are written at the call site.
    """
    return {'parameters': {key: _fixed(val) for key, val in settings.items()}}


# Sweep definition handed to wandb: every entry is a fixed value except the
# optimizer learning rate, which is chosen from the `learning_rate` list.
sweep_config = {
    'name': sweep_name,
    'method': method,
    'parameters': {
        'run_name': _fixed(run_name),
        'device': _fixed(device),
        'train_key': _fixed(train_key),
        # Training loader settings: shuffled.
        'train_kwargs': _fixed_group(
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
            pin_memory=pin_memory,
        ),
        'val_key': _fixed(val_key),
        'test_key': _fixed(test_key),
        # Validation loader settings: same as training but not shuffled.
        'val_kwargs': _fixed_group(
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            pin_memory=pin_memory,
        ),
        'model_key': _fixed(model_key),
        'optimizer_key': _fixed(optimizer_key),
        # The only entry using 'values' (a list of candidates) instead of 'value'.
        'optimizer_kwargs': {
            'parameters': {
                'lr': {'values': learning_rate},
            },
        },
        'loss_key': _fixed(loss_key),
        'metric': _fixed(metric),
        'epochs': _fixed(epochs),
        'callback_step': _fixed(callback_step),
        'callback_key': _fixed(callback_key),
        'lr_scheduler_key': _fixed(scheduler_key),
        'lr_scheduler_kwargs': _fixed_group(
            start_factor=start_factor,
            end_factor=end_factor,
            verbose=scheduler_verbose,
            patience=scheduler_patience,
            scheduler_step=scheduler_step,
            scheduler_metric=scheduler_metric,
            eta_min=scheduler_eta_min,
        ),
    },
}

In [None]:
#| hide
# When running in Colab, switch the working directory to the repository's
# `nbs` folder before the sweep is started (the Colab setup cell left it at
# the project directory).
# NOTE(review): presumably needed so relative paths resolve the same way as
# in a local run — confirm.
if os.getenv("COLAB_RELEASE_TAG"):
  %cd /content/drive/MyDrive/GitHub/birdclef_2023/nbs

Running a sweep.

In [None]:
#|eval: false
#|output: false
# Register the sweep under the configured entity/project, then start an agent
# that calls `train` for up to `n_runs` runs of that sweep.
sweep_id = wandb.sweep(sweep_config, entity=entity, project=project)
wandb.agent(sweep_id, function=train, count=n_runs)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: qw3ercv4
Sweep URL: https://wandb.ai/4projects/bird-clef-lr-scheduler-test/sweeps/qw3ercv4


[34m[1mwandb[0m: Agent Starting Run: vxxew73m with config:
[34m[1mwandb[0m: 	callback_key: 
[34m[1mwandb[0m: 	callback_step: 1
[34m[1mwandb[0m: 	device: cpu
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	loss_key: ce
[34m[1mwandb[0m: 	lr_scheduler_key: cosine
[34m[1mwandb[0m: 	lr_scheduler_kwargs: {'end_factor': 1e-06, 'eta_min': 1e-09, 'patience': 5, 'scheduler_metric': 'loss', 'scheduler_step': 2, 'start_factor': 1, 'verbose': 1}
[34m[1mwandb[0m: 	metric: f1
[34m[1mwandb[0m: 	model_key: efficient_net_v2_s
[34m[1mwandb[0m: 	optimizer_key: adamw
[34m[1mwandb[0m: 	optimizer_kwargs: {'lr': 0.001}
[34m[1mwandb[0m: 	run_name: test4
[34m[1mwandb[0m: 	test_key: test_simple
[34m[1mwandb[0m: 	train_key: train_simple
[34m[1mwandb[0m: 	train_kwargs: {'batch_size': 16, 'num_workers': 2, 'pin_memory': False, 'shuffle': True}
[34m[1mwandb[0m: 	val_key: val_simple
[34m[1mwandb[0m: 	val_kwargs: {'batch_size': 16, 'num_workers': 2, 'pin_memory': Fal

Adjusting learning rate of group 0 to 1.0000e-03.
Training epoch 0


  4%|▍         | 2/53 [00:16<06:05,  7.18s/it]

Adjusting learning rate of group 0 to 9.9635e-04.


  8%|▊         | 4/53 [00:21<03:32,  4.33s/it]

Adjusting learning rate of group 0 to 9.8547e-04.


 11%|█▏        | 6/53 [00:27<02:36,  3.33s/it]

Adjusting learning rate of group 0 to 9.6751e-04.


 15%|█▌        | 8/53 [00:32<02:11,  2.91s/it]

Adjusting learning rate of group 0 to 9.4273e-04.


 19%|█▉        | 10/53 [00:37<01:57,  2.72s/it]

Adjusting learning rate of group 0 to 9.1149e-04.


 23%|██▎       | 12/53 [00:42<01:47,  2.62s/it]

Adjusting learning rate of group 0 to 8.7426e-04.


 26%|██▋       | 14/53 [00:47<01:45,  2.70s/it]

Adjusting learning rate of group 0 to 8.3156e-04.


 30%|███       | 16/53 [00:52<01:36,  2.60s/it]

Adjusting learning rate of group 0 to 7.8403e-04.


 34%|███▍      | 18/53 [00:58<01:30,  2.59s/it]

Adjusting learning rate of group 0 to 7.3236e-04.


 38%|███▊      | 20/53 [01:03<01:27,  2.64s/it]

Adjusting learning rate of group 0 to 6.7730e-04.


 42%|████▏     | 22/53 [01:08<01:20,  2.60s/it]

Adjusting learning rate of group 0 to 6.1966e-04.


 45%|████▌     | 24/53 [01:13<01:15,  2.61s/it]

Adjusting learning rate of group 0 to 5.6027e-04.


 49%|████▉     | 26/53 [01:19<01:12,  2.67s/it]

Adjusting learning rate of group 0 to 5.0000e-04.


 53%|█████▎    | 28/53 [01:24<01:03,  2.54s/it]

Adjusting learning rate of group 0 to 4.3973e-04.


In [None]:
#| hide
# Run nbdev's export step for the project's notebooks.
import nbdev

nbdev.nbdev_export()