In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

# Import Libraries

In [14]:
import torch
import torch.nn as nn
import pandas as pd
import os
from torchvision.transforms import transforms
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import yaml
import mlflow

from dataset import FruitImagesDataset
from utils import YoloLoss, intersection_over_union, non_max_supression, mean_average_precision, get_bboxes, save_checkpoint, load_checkpoint
from nets import YoloV1

In [3]:
# Seed PyTorch's global random number generator for reproducibility.
# Only torch is seeded here; Python's `random` module and NumPy are not touched.
seed = 42
torch.manual_seed(seed)

<torch._C.Generator at 0x116e911f0>

# Dataset Preprocessing

In [4]:
files_dir = 'dataset/train_zip/train'
test_dir = 'dataset/test_zip/test'


def _list_image_annotation_pairs(directory, image_col, annot_col):
    """Index the '.jpg' images in `directory` together with their '.xml' annotation names.

    Args:
        directory: Folder to scan (non-recursive).
        image_col: Name given to the image-filename Series/column.
        annot_col: Name given to the annotation-filename Series/column.

    Returns:
        tuple: (image Series, annotation Series, two-column DataFrame), all in
        sorted filename order. Annotation names are derived from the image
        names; their existence on disk is not checked here.
    """
    # The match is case-sensitive: only a lowercase '.jpg' suffix is accepted.
    image_names = [name for name in sorted(os.listdir(directory)) if name.endswith('.jpg')]
    # Swap the 4-character '.jpg' suffix for '.xml'.
    annot_names = [name[:-4] + '.xml' for name in image_names]

    image_series = pd.Series(image_names, name=image_col)
    annot_series = pd.Series(annot_names, name=annot_col)
    # Concatenating along axis=1 already yields a DataFrame, so no re-wrapping is needed.
    return image_series, annot_series, pd.concat([image_series, annot_series], axis=1)


images, annots, df = _list_image_annotation_pairs(files_dir, 'images', 'annots')
test_images, test_annots, test_df = _list_image_annotation_pairs(test_dir, 'test_images', 'test_annots')

# Model Training

In [5]:
# Constants
LEARNING_RATE = 2e-5  # learning rate handed to the Adam optimizer
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when PyTorch can see one
BATCH_SIZE = 16  # batch size handed to the DataLoaders
WEIGHT_DECAY = 0  # weight_decay handed to the Adam optimizer
EPOCHS = 20  # number of passes over the training set
NUM_WORKERS = 2  # NOTE(review): not passed to any DataLoader in the visible cells
PIN_MEMORY = True  # NOTE(review): not passed to any DataLoader in the visible cells
LOAD_MODEL = False  # when True, a checkpoint is restored from LOAD_MODEL_FILE before use
LOAD_MODEL_FILE = "model.pth"  # checkpoint path, used for both saving and loading

In [6]:
# Training function with interactive progress bar
def train_one_epoch(train_loader, model, optimizer, loss_fn, epoch, total_epochs, scheduler=None):
    """Run one optimisation pass over `train_loader` and return the mean batch loss.

    Args:
        train_loader: Sized iterable yielding `(x, y)` tensor batches.
        model: Network to optimise; it is switched to training mode here.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable invoked as `loss_fn(predictions, y)`; must return a
            scalar tensor supporting `.backward()` and `.item()`.
        epoch: Zero-based epoch index (shown to the user as `epoch + 1`).
        total_epochs: Total number of epochs; only used in the progress bar.
        scheduler: Accepted for backward compatibility; not used by this function.

    Returns:
        float: Average of the per-batch loss values for this epoch.

    Raises:
        ValueError: If `train_loader` contains no batches (previously this
            surfaced as a bare ZeroDivisionError after the loop).
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader is empty: cannot train on zero batches")

    model.train()
    loop = tqdm(train_loader, leave=True)
    # The description is constant for the whole epoch, so set it once.
    loop.set_description(f"Epoch [{epoch + 1}/{total_epochs}]")
    total_loss = 0.0

    for batch_idx, (x, y) in enumerate(loop):
        x, y = x.to(DEVICE), y.to(DEVICE)

        # Forward pass
        predictions = model(x)
        loss = loss_fn(predictions, y)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call copies the value back to the host.
        batch_loss = loss.item()
        total_loss += batch_loss

        # Structured progress bar updates
        loop.set_postfix({
            'Batch': f"{batch_idx + 1}/{num_batches}",
            'Loss': f"{batch_loss:.4f}",
            'Mean Loss': f"{total_loss / (batch_idx + 1):.4f}",
            'LR': optimizer.param_groups[0]['lr']
        })

    mean_loss = total_loss / num_batches
    print(f"Mean loss for epoch {epoch + 1}: {mean_loss:.4f}")
    return mean_loss

In [7]:
# Evaluation function to calculate mAP (Mean Average Precision)
def evaluate_model(loader, model, iou_threshold=0.5, threshold=0.4):
    """Return the mean average precision of `model` over `loader`.

    The model is put into eval mode and is NOT switched back afterwards by
    this function.

    Args:
        loader: Data loader forwarded to `get_bboxes`.
        model: Network to evaluate.
        iou_threshold: Forwarded to both `get_bboxes` and `mean_average_precision`.
        threshold: Forwarded to `get_bboxes` (presumably the minimum box
            confidence - confirm against utils).

    Returns:
        Whatever `mean_average_precision` returns for midpoint-format boxes
        over 3 classes.
    """
    model.eval()
    predicted, ground_truth = get_bboxes(
        loader,
        model,
        iou_threshold=iou_threshold,
        threshold=threshold,
        device=DEVICE,
    )
    return mean_average_precision(
        predicted,
        ground_truth,
        iou_threshold=iou_threshold,
        box_format="midpoint",
        num_classes=3,
    )

In [8]:
class Compose:
    """Chain image-only transforms while carrying bounding boxes along unchanged."""

    def __init__(self, transforms):
        # Ordered sequence of callables, each taking an image and returning an image.
        self.transforms = transforms

    def __call__(self, img, bboxes):
        """Apply every transform to `img` in order; return `(img, bboxes)`.

        `bboxes` is handed back exactly as received - no transform ever sees it.
        """
        result = img
        for step in self.transforms:
            result = step(result)
        return result, bboxes

In [9]:

# Main training pipeline
def train_model():
    """Train YoloV1 on the training set and record the run in MLflow.

    Reads the tracking server URL and experiment name from
    'mlflow_config.yaml' (keys 'server_url' and 'experiment_name'), then for
    each epoch: trains, evaluates mAP on the training loader, steps the LR
    scheduler on that mAP, saves a checkpoint to LOAD_MODEL_FILE and logs it
    as an MLflow artifact. The final model is logged once training ends.

    Relies on notebook globals: df, files_dir, DEVICE, LEARNING_RATE,
    WEIGHT_DECAY, BATCH_SIZE, EPOCHS, LOAD_MODEL and LOAD_MODEL_FILE.
    """
    # Load the MLflow configuration
    with open('mlflow_config.yaml', 'r') as config_file:
        mlflow_config = yaml.safe_load(config_file)

    # Start MLflow experiment
    mlflow.set_tracking_uri(mlflow_config['server_url'])
    mlflow.set_experiment(mlflow_config['experiment_name'])

    # Architecture settings are named once so the logged values cannot drift
    # from the ones actually used to build the model.
    split_size, num_boxes, num_classes = 7, 2, 3

    with mlflow.start_run():

        model = YoloV1(split_size=split_size, num_boxes=num_boxes, num_classes=num_classes).to(DEVICE)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        # mode='max' because the monitored quantity (mAP) should increase.
        # NOTE(review): `verbose` is reported as deprecated by recent PyTorch releases - confirm.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, factor=0.1, patience=3, mode='max', verbose=True)
        loss_fn = YoloLoss()

        # Log model parameters
        mlflow.log_params({
            "split_size": split_size,
            "num_boxes": num_boxes,
            "num_classes": num_classes,
            "learning_rate": LEARNING_RATE,
            "weight_decay": WEIGHT_DECAY,
            "batch_size": BATCH_SIZE,
            "epochs": EPOCHS,
        })

        # Load model checkpoint if required
        if LOAD_MODEL:
            # map_location lets a checkpoint saved on a GPU be restored on a
            # CPU-only machine (and vice versa) instead of failing in torch.load.
            load_checkpoint(torch.load(LOAD_MODEL_FILE, map_location=DEVICE), model, optimizer)

        transform = Compose([transforms.Resize((448, 448)), transforms.ToTensor()])

        # Prepare data
        train_dataset = FruitImagesDataset(df=df, transform=transform, files_dir=files_dir)
        train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)

        for epoch in range(EPOCHS):
            train_loss = train_one_epoch(train_loader, model, optimizer, loss_fn, epoch, EPOCHS)

            mlflow.log_metric("train_loss", train_loss, step=epoch)

            train_mAP = evaluate_model(train_loader, model)
            print(f"Train mAP for epoch {epoch + 1}: {train_mAP}")

            # Log mAP
            mlflow.log_metric("train_mAP", train_mAP, step=epoch)

            # Adjust learning rate
            scheduler.step(train_mAP)

            # Save model checkpoint (same file every epoch, so only the latest is kept on disk)
            checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
            save_checkpoint(checkpoint, filename=LOAD_MODEL_FILE)

            # Log the checkpoint as an artifact
            mlflow.log_artifact(LOAD_MODEL_FILE)

        mlflow.pytorch.log_model(model, "model")


In [15]:

# Execute training
# Running this cell starts the full training run (the captured output below
# shows it executing from the notebook); expect it to take a long time.
if __name__ == "__main__":
    train_model()


2024/10/12 22:27:23 INFO mlflow.tracking.fluent: Experiment with name 'YoloV1_from_scratch' does not exist. Creating a new experiment.
  load_checkpoint(torch.load(LOAD_MODEL_FILE), model, optimizer)


=> Loading Checkpoint


Epoch [1/20]: 100%|██████████| 15/15 [02:58<00:00, 11.90s/it, Batch=15/15, Loss=47.9110, Mean Loss=54.1809, LR=2e-5]


Mean loss for epoch 1: 54.1809
Train mAP for epoch 1: 0.7312399744987488
=> Saving Checkpoint


Epoch [2/20]:  47%|████▋     | 7/15 [01:31<01:42, 12.78s/it, Batch=7/15, Loss=43.1191, Mean Loss=47.5606, LR=2e-5]

In [None]:
# Turn checkpoint loading on so predict_model() restores weights from LOAD_MODEL_FILE.
# NOTE(review): these are notebook-wide globals. train_model() reads the same
# LOAD_MODEL flag, so re-running the training cell after this cell will resume
# from the checkpoint instead of starting from fresh weights.
LOAD_MODEL = True
LOAD_MODEL_FILE = "model.pth"

# Prediction pipeline
def predict_model():
    """Evaluate YoloV1 on the test set and print its mAP.

    Builds a fresh model, restores weights from LOAD_MODEL_FILE when the
    global LOAD_MODEL flag is set (otherwise the untrained model is
    evaluated), and scores it on the images indexed by `test_df`/`test_dir`.
    """
    model = YoloV1(split_size=7, num_boxes=2, num_classes=3).to(DEVICE)
    # The optimizer exists only because load_checkpoint is called with one.
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    # Load the model
    if LOAD_MODEL:
        # map_location lets a checkpoint saved on a GPU be restored on a
        # CPU-only machine (and vice versa) instead of failing in torch.load.
        load_checkpoint(torch.load(LOAD_MODEL_FILE, map_location=DEVICE), model, optimizer)

    transform = Compose([transforms.Resize((448, 448)), transforms.ToTensor()])

    # Prepare data
    test_dataset = FruitImagesDataset(df=test_df, transform=transform, files_dir=test_dir)
    # Evaluation gains nothing from shuffling; a fixed order keeps runs repeatable.
    test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

    test_mAP = evaluate_model(test_loader, model)
    print(f"Test mAP: {test_mAP}")


# Score the model on the test set.
# NOTE(review): the saved output for this cell shows a NameError for
# 'transform', yet predict_model() assigns `transform` locally - the output
# appears to come from an earlier version of the cell; re-run to refresh it.
predict_model()

  load_checkpoint(torch.load(LOAD_MODEL_FILE), model, optimizer)


=> Loading Checkpoint


NameError: name 'transform' is not defined