## Hyperparameter tuning

### Setup

In [None]:
import torch
# Select the compute device: the first CUDA GPU when PyTorch can see one, else the CPU.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda:0") if cuda_ready else torch.device("cpu")
print(f"Using device: {device}")

# On a CUDA machine, report the GPU model and memory and enable cuDNN benchmark mode.
if cuda_ready:
    gpu_props = torch.cuda.get_device_properties(0)
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = gpu_props.total_memory / (1024**3)  # bytes -> GiB (reported as "GB")
    print(f"GPU: {gpu_name} with {gpu_mem:.2f} GB memory")

    # Tuned on an RTX 3070 (8GB VRAM); benchmark mode is meant for fixed input sizes.
    torch.backends.cudnn.benchmark = True
    print("CUDNN benchmark enabled for performance optimization")

Using device: cuda:0
GPU: NVIDIA GeForce RTX 3070 with 8.00 GB memory
CUDNN benchmark enabled for performance optimization


In [None]:
import config
import mlflow
import glob
import os
from functions.utility import analyze_webdataset
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="webdataset")

# Configure MLflow: set the active experiment for the runs logged by this notebook.
mlflow.set_experiment("animals10")

# Define constants
DATA_DIR = "./data/webdataset/"

# Collect the WebDataset shards. Paths stay relative to the working directory and are
# sorted so the shard order is deterministic; wrap each path in os.path.abspath() if a
# consumer needs absolute paths instead.
config.TRAIN_PATHS = sorted(glob.glob(os.path.join(DATA_DIR, "train-*.tar")))
config.TEST_PATHS = sorted(glob.glob(os.path.join(DATA_DIR, "test-*.tar")))

print(f"Found {len(config.TRAIN_PATHS)} training files and {len(config.TEST_PATHS)} test files")

# Fail fast: glob.glob() silently returns an empty list when nothing matches (wrong
# working directory, dataset not built yet), which would otherwise only surface later
# as an obscure error inside dataset analysis or training.
if not config.TRAIN_PATHS:
    raise FileNotFoundError(
        f"No training shards matching 'train-*.tar' found in {os.path.abspath(DATA_DIR)}; "
        "check DATA_DIR and the notebook's working directory."
    )

# Derive the class metadata from the dataset itself rather than hard-coding it.
num_classes, class_names, class_weights = analyze_webdataset(DATA_DIR, verbose=False)
print("\nTraining data summary:")
print(f"Number of classes: {num_classes}")
print(f"Class names: {class_names}")
print(f"Class weights tensor shape: {class_weights.shape}")

# Publish the results on the shared config module so other project code that imports
# `config` can read them.
config.NUM_CLASSES = num_classes
config.CLASS_NAMES = class_names
config.CLASS_WEIGHTS = class_weights


Found 22 training files and 3 test files

Training data summary:
Number of classes: 10
Class names: ['spider', 'dog', 'chicken', 'horse', 'butterfly', 'squirrel', 'cow', 'sheep', 'cat', 'elephant']
Class weights tensor shape: torch.Size([10])


### Hyperparameter search
Three-fold cross-validation is used.

Trials are pruned aggressively in two ways: a trial is dropped if its first fold falls below a minimum accuracy threshold, and Optuna's median pruning strategy is applied to the running average of validation accuracy after each fold.

The metric optimized by Optuna is the lower confidence bound (t-distribution, 80% confidence) of the average validation accuracy, taken at the epoch where the average across all folds is best.

Each trial is logged with MLflow and can be viewed by running `mlflow ui` in a terminal.

The Optuna study is persisted in a SQLite database in the project root folder; the file name is set by the `db_path` variable.

In [None]:
# Launch the Optuna hyperparameter search, scoring each trial with k-fold cross validation
from functions.hyperopt import run_kfold_optuna_optimization

# SQLite file that holds the study
db_path = "optuna_animals10_kfold.db"

# Search settings, gathered in one place so they are easy to tweak between runs.
search_settings = {
    "n_trials": 1,               # number of trials
    "k": 3,                      # number of folds
    "verbose": False,            # reduce output
    "storage": db_path,          # store results in SQLite
    "load_if_exists": True,
    "first_fold_min_acc": 95.0,  # minimum accuracy for the first fold
}
k_fold_study = run_kfold_optuna_optimization(**search_settings)

Using SQLite storage at: sqlite:///optuna_animals10_kfold.db
Loaded existing study with 1 previous trials.
Could not load existing study: Record does not exist.
Best avg validation at epoch 10: 97.79% ± 0.24%
Objective value - t-dist Lower confidence bound (80.0%): 97.64%
Trial 1 completed with value: 97.64

K-Fold Study statistics:
  Number of finished trials: 2
  Number of pruned trials: 0
  Best trial:
    Value: 97.63728594742982 t-dist 80% lower bound
    Params:
      learning_rate: 0.0001329291894316216
      batch_size: 8
      weight_decay: 2.9380279387035354e-06
      dropout_rate: 0.07799726016810132
      augmentation_intensity: medium
      patience: 8
      max_epochs: 10
