## Imports

In [14]:
import os
import platform
import random
import kagglehub
import mlflow
import mlflow.pytorch
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import tqdm as notebook_tqdm # Needed for tqdm in Jupyter Notebook (Certain cell outputs will complain if this is not included)
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from torchvision.models import ResNet18_Weights

## Hyperparameters Options

In [15]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EXPERIMENT_NAME = "fire-smoke-detection-resnet-tuning"
SEED = 42
NUM_EPOCHS = 10
NUM_TRIALS = 5
BATCH_SIZE_OPTIONS = [16, 32, 64, 128]
LEARNING_RATE_OPTIONS = [1e-4, 1e-3, 1e-2]
WEIGHT_DECAY_OPTIONS = [1e-6, 1e-5, 1e-4]
EARLY_STOP_PATIENCE = 3

## Download Dataset

In [16]:
DATASET_PATH = kagglehub.dataset_download("sayedgamal99/smoke-fire-detection-yolo")

## Data Augmentation Options
Input images are expected to be 224x224

In [17]:
TRAIN_TRANSFORM = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])

EVAL_TRANSFORM = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

## Construct Custom Dataset
The original dataset is structure as such:
- []  = 'No Smoke and No Fire'
- 0 = 'Smoke Only'
- 1 = 'Fire and Smoke'

The custom dataset modifies this as such:
- 0 = 'No Smoke and No Fire'
- 1 = 'Smoke Only'
- 2 = 'Fire and Smoke'

In [18]:
class CustomDataset(Dataset):
    def __init__(self, images_dir, labels_dir, transform=None):
        self.images_dir = images_dir
        self.labels_dir = labels_dir
        self.transform = transform
        self.image_files = sorted(os.listdir(images_dir))

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_name = self.image_files[idx]
        img_path = os.path.join(self.images_dir, img_name)
        label_path = os.path.join(self.labels_dir, img_name.replace(".jpg", ".txt"))

        image = Image.open(img_path).convert("RGB")
        with open(label_path, "r") as f:
            label_content = f.read().strip()

        # 0: none, 1: smoke, 2: fire
        if not label_content:
            label = 0
        else:
            first_number = int(label_content.split()[0])
            label = 1 if first_number == 0 else 2

        if self.transform:
            image = self.transform(image)

        return image, label

## Training Code
The training parameters are provided by the Optuna Trails

In [19]:
def train_with_params(params: dict, train_dataset, val_dataset, test_dataset) -> float:
    """
    Train the model using the provided parameters and datasets.
    Returns the best validation loss.
    """
    
    batch_size = params["batch_size"]
    learning_rate = params["learning_rate"]
    weight_decay = params["weight_decay"]
    num_epochs = params["num_epochs"]
    early_stop_patience = params["early_stop_patience"]

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    device = DEVICE
    model = models.resnet18(weights=ResNet18_Weights.DEFAULT)
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, 3)
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1)

    run_name = f"bs{batch_size}_lr{learning_rate:.0e}_wd{weight_decay:.0e}"
    with mlflow.start_run(nested=True, run_name=run_name):
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("weight_decay", weight_decay)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("num_epochs", num_epochs)
        mlflow.log_param("early_stop_patience", early_stop_patience)
        mlflow.log_param("optimizer", optimizer.__class__.__name__)
        mlflow.log_param("scheduler", scheduler.__class__.__name__)
        mlflow.log_param("platform", platform.platform())
        mlflow.log_param("python_version", platform.python_version())

        best_val_loss = float('inf')
        best_model_state = None
        best_epoch = -1
        epochs_no_improve = 0
        train_losses = []
        val_losses = []

        for epoch in range(num_epochs):
            model.train()
            train_loss = 0.0
            for batch_idx, (inputs, labels) in enumerate(train_loader, 1):
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
                train_losses.append(loss.item())
                mlflow.log_metric("batch_training_loss", loss.item(), step=epoch * len(train_loader) + batch_idx)
                print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}/{len(train_loader)}, Training Loss: {loss.item():.4f}")

            model.eval()
            val_loss = 0.0
            val_correct = 0
            val_total = 0
            with torch.no_grad():
                for val_inputs, val_labels in val_loader:
                    val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)
                    val_outputs = model(val_inputs)
                    v_loss = criterion(val_outputs, val_labels)
                    val_loss += v_loss.item()
                    _, val_predicted = torch.max(val_outputs, 1)
                    val_correct += (val_predicted == val_labels).sum().item()
                    val_total += val_labels.size(0)

            avg_train_loss = train_loss / len(train_loader)
            avg_val_loss = val_loss / len(val_loader)
            val_losses.append(avg_val_loss)
            val_accuracy = val_correct / val_total if val_total > 0 else 0
            mlflow.log_metric("training_loss", avg_train_loss, step=epoch)
            mlflow.log_metric("validation_loss", avg_val_loss, step=epoch)
            mlflow.log_metric("validation_accuracy", val_accuracy, step=epoch)
            print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

            checkpoint = {
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "scheduler_state_dict": scheduler.state_dict(),
                "best_val_loss": best_val_loss,
            }
            checkpoint_path = f"checkpoint_epoch_{epoch+1}.pth"
            torch.save(checkpoint, checkpoint_path)
            mlflow.log_artifact(checkpoint_path)
            os.remove(checkpoint_path)

            if avg_val_loss < best_val_loss:
                epochs_no_improve = 0
                best_model_state = model.state_dict()
                best_val_loss = avg_val_loss
                best_epoch = epoch
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= early_stop_patience:
                    break

            scheduler.step(avg_val_loss)
            best_val_accuracy = max(val_losses)
            mlflow.log_metric("best_val_accuracy", best_val_accuracy)
            mlflow.log_metric("learning_rate", optimizer.param_groups[0]['lr'], step=epoch)

        if best_model_state is not None:
            model.load_state_dict(best_model_state)
            best_checkpoint = {
                "epoch": best_epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "scheduler_state_dict": scheduler.state_dict(),
                "best_val_loss": best_val_loss,
            }
            torch.save(best_checkpoint, "best_model.pth")
            mlflow.log_artifact("best_model.pth")
            os.remove("best_model.pth")

        test_loss = 0.0
        test_correct = 0
        test_total = 0
        with torch.no_grad():
            for test_inputs, test_labels in test_loader:
                test_inputs, test_labels = test_inputs.to(device), test_labels.to(device)
                outputs = model(test_inputs)
                loss = criterion(outputs, test_labels)
                test_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                test_correct += (predicted == test_labels).sum().item()
                test_total += test_labels.size(0)
        avg_test_loss = test_loss / len(test_loader)
        test_accuracy = test_correct / test_total if test_total > 0 else 0
        mlflow.log_metric("test_loss", avg_test_loss)
        mlflow.log_metric("test_accuracy", test_accuracy)
        print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

        # Plot the confusion matrix
        y_true, y_pred = [], []
        with torch.no_grad():
            for test_inputs, test_labels in test_loader:
                test_inputs, test_labels = test_inputs.to(device), test_labels.to(device)
                outputs = model(test_inputs)
                _, predicted = torch.max(outputs, 1)
                y_true.extend(test_labels.cpu().numpy())
                y_pred.extend(predicted.cpu().numpy())
        cm = confusion_matrix(y_true, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot(cmap=plt.cm.Blues)
        plt.title("Test Confusion Matrix")
        plt.tight_layout()
        plt.savefig("test_confusion_matrix.png")
        mlflow.log_artifact("test_confusion_matrix.png")
        plt.close()
        os.remove("test_confusion_matrix.png")

        # Calculate average training loss per epoch
        num_batches_per_epoch = len(train_loader)
        train_loss_per_epoch = [
            np.mean(train_losses[i * num_batches_per_epoch : (i + 1) * num_batches_per_epoch])
            for i in range(len(val_losses))
        ]

        # Plot training loss per epoch
        plt.figure()
        plt.plot(range(1, len(train_loss_per_epoch) + 1), train_loss_per_epoch, marker="o", label="Training Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("Training Loss per Epoch")
        plt.legend()
        plt.tight_layout()
        plt.savefig("training_loss_per_epoch.png")
        mlflow.log_artifact("training_loss_per_epoch.png")
        plt.close()
        os.remove("training_loss_per_epoch.png")

        # Plot validation loss per epoch
        plt.figure()
        plt.plot(range(1, len(val_losses) + 1), val_losses, marker="o", color="orange", label="Validation Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("Validation Loss per Epoch")
        plt.legend()
        plt.tight_layout()
        plt.savefig("validation_loss_per_epoch.png")
        mlflow.log_artifact("validation_loss_per_epoch.png")
        plt.close()
        os.remove("validation_loss_per_epoch.png")

        return best_val_loss

## Experiment and Trails Set-Up

In [20]:
def objective(trial, train_dataset, val_dataset, test_dataset):
    params = {
        "batch_size": trial.suggest_categorical("batch_size", BATCH_SIZE_OPTIONS),
        "learning_rate": trial.suggest_float("learning_rate", LEARNING_RATE_OPTIONS[0], LEARNING_RATE_OPTIONS[-1]),
        "weight_decay": trial.suggest_float("weight_decay", WEIGHT_DECAY_OPTIONS[0], WEIGHT_DECAY_OPTIONS[-1], log=True),
        "num_epochs": NUM_EPOCHS,
        "early_stop_patience": EARLY_STOP_PATIENCE,
    }
    return train_with_params(params, train_dataset, val_dataset, test_dataset)

def start_experiment():
    # Set seed for reproducibility
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(SEED)

    # Load training dataset
    train_dataset = CustomDataset(
        images_dir= os.path.join(DATASET_PATH, "data/train/images"),
        labels_dir= os.path.join(DATASET_PATH, "data/train/labels"),
        transform=TRAIN_TRANSFORM
    )

    # Load validation dataset
    val_dataset = CustomDataset(
        images_dir= os.path.join(DATASET_PATH, "data/val/images"),
        labels_dir= os.path.join(DATASET_PATH, "data/val/labels"),
        transform=EVAL_TRANSFORM
    )

    # Load test dataset
    test_dataset = CustomDataset(
        images_dir= os.path.join(DATASET_PATH, "data/test/images"),
        labels_dir= os.path.join(DATASET_PATH, "data/test/labels"),
        transform=EVAL_TRANSFORM
    )

    # Create study
    study = optuna.create_study(direction="minimize", study_name=EXPERIMENT_NAME)
    mlflow.set_experiment(EXPERIMENT_NAME)
    study.optimize(
        lambda trial: objective(trial, train_dataset, val_dataset, test_dataset),
        n_trials=NUM_TRIALS
    )

    # Print best trial
    print("Best trial:")
    print(f"  Value (best validation loss): {study.best_trial.value}")
    print("  Params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

    # Log best trial info with MLflow
    mlflow.log_metric("best_val_loss", study.best_trial.value)
    for key, value in study.best_trial.params.items():
        mlflow.log_param(f"best_{key}", value)

## Start the Experiment

In [21]:
start_experiment()

[I 2025-04-27 19:21:15,696] A new study created in memory with name: fire-smoke-detection-resnet-tuning


Epoch 1/10, Batch 1/111, Training Loss: 1.2017
Epoch 1/10, Batch 2/111, Training Loss: 0.8627
Epoch 1/10, Batch 3/111, Training Loss: 0.8118
Epoch 1/10, Batch 4/111, Training Loss: 0.8121
Epoch 1/10, Batch 5/111, Training Loss: 0.7322
Epoch 1/10, Batch 6/111, Training Loss: 0.5535
Epoch 1/10, Batch 7/111, Training Loss: 0.6809
Epoch 1/10, Batch 8/111, Training Loss: 0.5731
Epoch 1/10, Batch 9/111, Training Loss: 0.5743
Epoch 1/10, Batch 10/111, Training Loss: 0.5422
Epoch 1/10, Batch 11/111, Training Loss: 0.6057
Epoch 1/10, Batch 12/111, Training Loss: 0.6286
Epoch 1/10, Batch 13/111, Training Loss: 0.6150
Epoch 1/10, Batch 14/111, Training Loss: 0.6555
Epoch 1/10, Batch 15/111, Training Loss: 0.6015
Epoch 1/10, Batch 16/111, Training Loss: 0.4681
Epoch 1/10, Batch 17/111, Training Loss: 0.6879
Epoch 1/10, Batch 18/111, Training Loss: 0.5191
Epoch 1/10, Batch 19/111, Training Loss: 0.5891
Epoch 1/10, Batch 20/111, Training Loss: 0.3996
Epoch 1/10, Batch 21/111, Training Loss: 0.6512
E

[I 2025-04-27 19:42:16,688] Trial 0 finished with value: 0.2196991708036512 and parameters: {'batch_size': 128, 'learning_rate': 0.0005523846634426587, 'weight_decay': 7.827086112748106e-05}. Best is trial 0 with value: 0.2196991708036512.


Epoch 1/10, Batch 1/883, Training Loss: 1.4911
Epoch 1/10, Batch 2/883, Training Loss: 1.9218
Epoch 1/10, Batch 3/883, Training Loss: 2.5364
Epoch 1/10, Batch 4/883, Training Loss: 4.6537
Epoch 1/10, Batch 5/883, Training Loss: 4.2209
Epoch 1/10, Batch 6/883, Training Loss: 1.5251
Epoch 1/10, Batch 7/883, Training Loss: 1.8556
Epoch 1/10, Batch 8/883, Training Loss: 1.3658
Epoch 1/10, Batch 9/883, Training Loss: 1.2086
Epoch 1/10, Batch 10/883, Training Loss: 1.0472
Epoch 1/10, Batch 11/883, Training Loss: 1.4641
Epoch 1/10, Batch 12/883, Training Loss: 2.1578
Epoch 1/10, Batch 13/883, Training Loss: 0.8838
Epoch 1/10, Batch 14/883, Training Loss: 0.8370
Epoch 1/10, Batch 15/883, Training Loss: 1.1974
Epoch 1/10, Batch 16/883, Training Loss: 1.2286
Epoch 1/10, Batch 17/883, Training Loss: 0.9854
Epoch 1/10, Batch 18/883, Training Loss: 0.9684
Epoch 1/10, Batch 19/883, Training Loss: 1.0462
Epoch 1/10, Batch 20/883, Training Loss: 1.2593
Epoch 1/10, Batch 21/883, Training Loss: 0.9387
E

[I 2025-04-27 20:03:40,605] Trial 1 finished with value: 0.5372937212316511 and parameters: {'batch_size': 16, 'learning_rate': 0.0046246250285297795, 'weight_decay': 7.14177291770463e-05}. Best is trial 0 with value: 0.2196991708036512.


Epoch 1/10, Batch 1/442, Training Loss: 1.6141
Epoch 1/10, Batch 2/442, Training Loss: 1.6847
Epoch 1/10, Batch 3/442, Training Loss: 3.2437
Epoch 1/10, Batch 4/442, Training Loss: 3.2642
Epoch 1/10, Batch 5/442, Training Loss: 1.8986
Epoch 1/10, Batch 6/442, Training Loss: 1.4026
Epoch 1/10, Batch 7/442, Training Loss: 1.2354
Epoch 1/10, Batch 8/442, Training Loss: 1.2611
Epoch 1/10, Batch 9/442, Training Loss: 1.3275
Epoch 1/10, Batch 10/442, Training Loss: 1.0095
Epoch 1/10, Batch 11/442, Training Loss: 0.9113
Epoch 1/10, Batch 12/442, Training Loss: 0.9545
Epoch 1/10, Batch 13/442, Training Loss: 2.0563
Epoch 1/10, Batch 14/442, Training Loss: 1.0994
Epoch 1/10, Batch 15/442, Training Loss: 1.0934
Epoch 1/10, Batch 16/442, Training Loss: 1.1653
Epoch 1/10, Batch 17/442, Training Loss: 0.9365
Epoch 1/10, Batch 18/442, Training Loss: 0.8920
Epoch 1/10, Batch 19/442, Training Loss: 0.9190
Epoch 1/10, Batch 20/442, Training Loss: 1.0503
Epoch 1/10, Batch 21/442, Training Loss: 1.3044
E

[I 2025-04-27 20:23:59,560] Trial 2 finished with value: 0.446455329368563 and parameters: {'batch_size': 32, 'learning_rate': 0.0036333389999519293, 'weight_decay': 1.2022950123015283e-06}. Best is trial 0 with value: 0.2196991708036512.


Epoch 1/10, Batch 1/883, Training Loss: 1.3854
Epoch 1/10, Batch 2/883, Training Loss: 3.8887
Epoch 1/10, Batch 3/883, Training Loss: 2.9946
Epoch 1/10, Batch 4/883, Training Loss: 2.3365
Epoch 1/10, Batch 5/883, Training Loss: 1.9075
Epoch 1/10, Batch 6/883, Training Loss: 2.8220
Epoch 1/10, Batch 7/883, Training Loss: 3.1399
Epoch 1/10, Batch 8/883, Training Loss: 2.9855
Epoch 1/10, Batch 9/883, Training Loss: 1.9982
Epoch 1/10, Batch 10/883, Training Loss: 2.0543
Epoch 1/10, Batch 11/883, Training Loss: 1.2037
Epoch 1/10, Batch 12/883, Training Loss: 1.0874
Epoch 1/10, Batch 13/883, Training Loss: 2.0908
Epoch 1/10, Batch 14/883, Training Loss: 1.5815
Epoch 1/10, Batch 15/883, Training Loss: 0.6989
Epoch 1/10, Batch 16/883, Training Loss: 2.0420
Epoch 1/10, Batch 17/883, Training Loss: 2.1085
Epoch 1/10, Batch 18/883, Training Loss: 1.3202
Epoch 1/10, Batch 19/883, Training Loss: 1.1209
Epoch 1/10, Batch 20/883, Training Loss: 1.3163
Epoch 1/10, Batch 21/883, Training Loss: 1.4990
E

[I 2025-04-27 20:44:53,349] Trial 3 finished with value: 0.484209561086207 and parameters: {'batch_size': 16, 'learning_rate': 0.007188326518439874, 'weight_decay': 5.235169847231183e-06}. Best is trial 0 with value: 0.2196991708036512.


Epoch 1/10, Batch 1/111, Training Loss: 1.1284
Epoch 1/10, Batch 2/111, Training Loss: 2.4873
Epoch 1/10, Batch 3/111, Training Loss: 2.0630
Epoch 1/10, Batch 4/111, Training Loss: 1.6952
Epoch 1/10, Batch 5/111, Training Loss: 1.6865
Epoch 1/10, Batch 6/111, Training Loss: 1.0415
Epoch 1/10, Batch 7/111, Training Loss: 1.2857
Epoch 1/10, Batch 8/111, Training Loss: 1.6115
Epoch 1/10, Batch 9/111, Training Loss: 1.1464
Epoch 1/10, Batch 10/111, Training Loss: 1.0288
Epoch 1/10, Batch 11/111, Training Loss: 1.5675
Epoch 1/10, Batch 12/111, Training Loss: 1.1233
Epoch 1/10, Batch 13/111, Training Loss: 1.0533
Epoch 1/10, Batch 14/111, Training Loss: 1.0696
Epoch 1/10, Batch 15/111, Training Loss: 1.0273
Epoch 1/10, Batch 16/111, Training Loss: 1.5196
Epoch 1/10, Batch 17/111, Training Loss: 1.2469
Epoch 1/10, Batch 18/111, Training Loss: 1.3240
Epoch 1/10, Batch 19/111, Training Loss: 1.2535
Epoch 1/10, Batch 20/111, Training Loss: 1.0707
Epoch 1/10, Batch 21/111, Training Loss: 1.1785
E

[I 2025-04-27 21:06:00,586] Trial 4 finished with value: 0.511853081882 and parameters: {'batch_size': 128, 'learning_rate': 0.007663800832372427, 'weight_decay': 9.880842173137917e-05}. Best is trial 0 with value: 0.2196991708036512.


Best trial:
  Value (best validation loss): 0.2196991708036512
  Params: 
    batch_size: 128
    learning_rate: 0.0005523846634426587
    weight_decay: 7.827086112748106e-05
