# experiment

> Notebook where the training experiments take place

In [None]:
#| default_exp experiment

In [None]:
#| hide
import os
if not os.getenv("COLAB_RELEASE_TAG"):
    from nbdev.showdoc import *
    from fastcore.test import *
    from fastcore.utils import *

In [None]:
#| hide
# Environment bootstrap: the COLAB_RELEASE_TAG environment variable is used
# as the signal that this notebook is running inside Google Colab.
if os.getenv("COLAB_RELEASE_TAG"):
   print("Running in Colab")       
   # Mount Google Drive so the repository checkout stored there is reachable.
   from google.colab import drive
   drive.mount('/content/drive')
   # Move into the repository root on the mounted drive.
   %cd /content/drive/MyDrive/GitHub/birdclef_2023
   # Install wandb into the Colab runtime before it is imported below.
   %pip install wandb
else:
   # No setup is performed outside Colab.
   print("NOT in Colab")

NOT in Colab


In [None]:
#| export
import wandb

from birdclef.trainer import train

These are the variables that must be set to start an experiment:

1. **project**: The name of the wandb project where the training, evaluation and test results will be logged and stored.

2. **entity**: The wandb entity associated with the project.

3. **sweep_name**: The name given to the sweep configuration, which defines the hyperparameter search setup for an experiment. It's used to organize and categorize different hyperparameter tuning runs.

4. **method**: The method or strategy used for hyperparameter tuning. Here, `'random'` means that hyperparameters are randomly chosen from the specified ranges or values during the sweep.

5. **n_runs**: The number of runs or iterations that will be performed during the hyperparameter sweep. Each run involves training the model with a specific set of hyperparameters.

6. **run_name**: The name given to each individual run or iteration of the experiment. It helps identify and differentiate between different runs, providing a meaningful label for tracking and analysis.

7. **device**: The computational device (e.g., 'cpu', 'cuda') on which the training and evaluation of the model will be performed.

8. **train_key**: Key or identifier used to access the training dataset. Refer to `get_dataset` for info about available keys.

9. **val_key**: Key or identifier used to access the validation dataset. Refer to `get_dataset` for info about available keys.

10. **test_key**: Key or identifier used to access the test dataset. Refer to `get_dataset` for info about available keys.

11. **batch_size**: The number of samples in each mini-batch during training. It affects the efficiency of the training process and the model's ability to generalize.

12. **num_workers**: The number of worker subprocesses used to load data in parallel during training. It can help speed up the data loading process.

13. **pin_memory**: A boolean indicating whether to pin memory for faster data transfer to the GPU. This is often beneficial when using a GPU for training.

14. **model_key**: Key or identifier used to specify the model architecture to be used for training. Refer to `get_model` for info about available keys.

15. **optimizer_key**: Key or identifier used to specify the optimizer to be used during the training process. Refer to `get_optimizer` for info about available keys.

16. **learning_rate**: A list of learning rates to be used by the optimizer during training. Learning rate is a crucial hyperparameter affecting the convergence and performance of the model.

17. **loss_key**: Key or identifier used to specify the loss function to be used during training. Refer to `get_loss_func` for info about available keys.

18. **metric**: The metric used to evaluate the model's performance. This metric is used to compare and choose the best model in a single run. Refer to `compute_metrics` for info about available metrics.

19. **epochs**: The number of epochs or complete passes through the training dataset during the training process. One epoch is a single pass through the entire training dataset.

20. **callback_step**: The callback function is called every *n* training steps, where *n* is the value of *callback_step*.

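21. **callback_key**: Key or identifier used to select the callback function that is called every *callback_step* steps (e.g. `'show'`).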

The existing dataset keys (valid values for `train_key`, `val_key` and `test_key`) are:
- train_base
- val_base
- test_base
- train_simple
- val_simple
- test_simple
- train_simple_per_channel
- val_simple_per_channel
- test_simple_per_channel
- train_base_per_channel
- val_base_per_channel
- test_base_per_channel
- train_base_pcn_aug
- val_base_pcn_aug
- test_base_pcn_aug
- train_base_pcn_rnd
- val_base_pcn_rnd
- test_base_pcn_rnd
- train_base_pcn_aug_rnd
- val_base_pcn_aug_rnd
- test_base_pcn_aug_rnd

In [None]:
# --- wandb destination and sweep identity -------------------------------
project = "bird-clef-cjavelanda"
entity = "4projects"
sweep_name = "sweep-per_channel"
method = "random"
n_runs = 1
run_name = "per_channel_linear_0005"

# --- hardware and data loading ------------------------------------------
device = "cuda"
train_key = "train_base_per_channel"
val_key = "val_base_per_channel"
test_key = "test_base_per_channel"
batch_size = 128
num_workers = 16
pin_memory = True

# --- model, optimizer, loss and model-selection metric ------------------
model_key = "efficient_net_v2_s"
optimizer_key = "adamw"
# Kept as a list: it is passed as the sweep's candidate `values` for `lr`.
learning_rate = [0.0005]
loss_key = "ce"
metric = "f1"
epochs = 11

# --- callback -----------------------------------------------------------
callback_step = 100
callback_key = "show"

# --- learning-rate scheduler --------------------------------------------
# Everything below except `scheduler_key` is forwarded through the sweep
# configuration's `lr_scheduler_kwargs` entry.
scheduler_key = "linear"
scheduler_metric = "loss"
scheduler_step = 2
start_factor = 1
end_factor = 1e-6
scheduler_verbose = 1
scheduler_patience = 5
scheduler_eta_min = 1e-9

Create the sweep configuration as a dict.

In [None]:
# Sweep definition handed to wandb. Every entry under "parameters" is either
# a fixed {"value": ...}, a list of candidates {"values": [...]}, or a nested
# group of further parameters {"parameters": {...}}.
sweep_config = {
    "name": sweep_name,
    "method": method,
    "parameters": {
        "run_name": {"value": run_name},
        "device": {"value": device},
        # Training data: dataset key plus its data-loading settings.
        "train_key": {"value": train_key},
        "train_kwargs": {
            "parameters": {
                "batch_size": {"value": batch_size},
                "shuffle": {"value": True},
                "num_workers": {"value": num_workers},
                "pin_memory": {"value": pin_memory},
            },
        },
        # Validation / test data: same loading settings, but no shuffling.
        "val_key": {"value": val_key},
        "test_key": {"value": test_key},
        "val_kwargs": {
            "parameters": {
                "batch_size": {"value": batch_size},
                "shuffle": {"value": False},
                "num_workers": {"value": num_workers},
                "pin_memory": {"value": pin_memory},
            },
        },
        # Model and optimization.
        "model_key": {"value": model_key},
        "optimizer_key": {"value": optimizer_key},
        "optimizer_kwargs": {
            "parameters": {
                # "values" (plural): `learning_rate` is a list of candidates.
                "lr": {"values": learning_rate},
            },
        },
        "loss_key": {"value": loss_key},
        "metric": {"value": metric},
        "epochs": {"value": epochs},
        # Callback.
        "callback_step": {"value": callback_step},
        "callback_key": {"value": callback_key},
        # Learning-rate scheduler and its settings.
        "lr_scheduler_key": {"value": scheduler_key},
        "lr_scheduler_kwargs": {
            "parameters": {
                "start_factor": {"value": start_factor},
                "end_factor": {"value": end_factor},
                "verbose": {"value": scheduler_verbose},
                "patience": {"value": scheduler_patience},
                "scheduler_step": {"value": scheduler_step},
                "scheduler_metric": {"value": scheduler_metric},
                "eta_min": {"value": scheduler_eta_min},
            },
        },
    },
}

In [None]:
#| hide
# When running in Colab (COLAB_RELEASE_TAG is set), switch the working
# directory to the repository's `nbs` folder on the mounted drive.
if os.getenv("COLAB_RELEASE_TAG"):
  %cd /content/drive/MyDrive/GitHub/birdclef_2023/nbs

Running a sweep.

In [None]:
# Sanity check before launching the sweep: the experiment is configured with
# device = 'cuda', so confirm that PyTorch reports CUDA as available.
import torch
torch.cuda.is_available()

True

In [None]:
#|eval: false
#|output: false
# Register the sweep under the configured wandb project/entity, then start an
# agent in this process that invokes `train` for at most `n_runs` runs.
sweep_id = wandb.sweep(sweep_config, entity=entity, project=project)
wandb.agent(sweep_id, function=train, count=n_runs)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: 8mo1tqm5
Sweep URL: https://wandb.ai/4projects/bird-clef-cjavelanda/sweeps/8mo1tqm5


[34m[1mwandb[0m: Agent Starting Run: cqw7h9b6 with config:
[34m[1mwandb[0m: 	callback_key: show
[34m[1mwandb[0m: 	callback_step: 100
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	epochs: 11
[34m[1mwandb[0m: 	loss_key: ce
[34m[1mwandb[0m: 	lr_scheduler_key: linear
[34m[1mwandb[0m: 	lr_scheduler_kwargs: {'end_factor': 1e-06, 'eta_min': 1e-09, 'patience': 5, 'scheduler_metric': 'loss', 'scheduler_step': 2, 'start_factor': 1, 'verbose': 1}
[34m[1mwandb[0m: 	metric: f1
[34m[1mwandb[0m: 	model_key: efficient_net_v2_s
[34m[1mwandb[0m: 	optimizer_key: adamw
[34m[1mwandb[0m: 	optimizer_kwargs: {'lr': 0.0005}
[34m[1mwandb[0m: 	run_name: per_channel_linear_0005
[34m[1mwandb[0m: 	test_key: test_base_per_channel
[34m[1mwandb[0m: 	train_key: train_base_per_channel
[34m[1mwandb[0m: 	train_kwargs: {'batch_size': 128, 'num_workers': 16, 'pin_memory': True, 'shuffle': True}
[34m[1mwandb[0m: 	val_key: val_base_per_channel
[34m[1mwandb[0m: 	val_kwar

Adjusting learning rate of group 0 to 5.0000e-04.
Training epoch 0


  0%|          | 0/78 [00:00<?, ?it/s]Traceback (most recent call last):
  File "h:\birds\birdclef_2023\birdclef\trainer.py", line 174, in train
    metrics, example_ct, step_ct = train_one_epoch(model, train_dl, loss_func, optimizer, config.device, epoch, example_ct, step_ct, n_steps_per_epoch, config.callback_step, callback_func, config.lr_scheduler_kwargs["scheduler_step"], config.lr_scheduler_kwargs["scheduler_metric"], lr_scheduler)
  File "h:\birds\birdclef_2023\birdclef\trainer.py", line 56, in train_one_epoch
    for step, data in enumerate(train_dl):
  File "h:\Birds\birdclef_2023\.venv\lib\site-packages\torch\utils\data\dataloader.py", line 630, in __next__
    data = self._next_data()
  File "h:\Birds\birdclef_2023\.venv\lib\site-packages\torch\utils\data\dataloader.py", line 1345, in _next_data
    return self._process_data(data)
  File "h:\Birds\birdclef_2023\.venv\lib\site-packages\torch\utils\data\dataloader.py", line 1371, in _process_data
    data.reraise()
  File "h:\

Run cqw7h9b6 errored: LibsndfileError('Caught LibsndfileError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n  File "h:\\Birds\\birdclef_2023\\.venv\\lib\\site-packages\\torch\\utils\\data\\_utils\\worker.py", line 308, in _worker_loop\n    data = fetcher.fetch(index)\n  File "h:\\Birds\\birdclef_2023\\.venv\\lib\\site-packages\\torch\\utils\\data\\_utils\\fetch.py", line 51, in fetch\n    data = [self.dataset[idx] for idx in possibly_batched_index]\n  File "h:\\Birds\\birdclef_2023\\.venv\\lib\\site-packages\\torch\\utils\\data\\_utils\\fetch.py", line 51, in <listcomp>\n    data = [self.dataset[idx] for idx in possibly_batched_index]\n  File "h:\\birds\\birdclef_2023\\birdclef\\dataset.py", line 163, in __getitem__\n    mel_spectrogram = self.pipeline(filename)\n  File "h:\\Birds\\birdclef_2023\\.venv\\lib\\site-packages\\torch\\nn\\modules\\module.py", line 1518, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n  File "h:\\Birds\\bir

In [None]:
#| hide
if not os.getenv("COLAB_RELEASE_TAG"):
    # Outside Colab only: run nbdev's export step for this project.
    import nbdev
    nbdev.nbdev_export()