# Finetuning with COCO Format

We can now set up a script to finetune with PyTorch Lightning on top of data in the standard COCO format.

This is based upon the following example: https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/train-huggingface-detr-on-custom-dataset.ipynb#scrollTo=qk6sRB0lueHY

In [None]:
%pip install -U timm transformers supervision roboflow lightning mlflow pycocotools
%pip install psutil pynvml 
%restart_python

## Setup Configurations

This will initialise all the variables that we need.

We need to specify the Unity Catalog (UC) catalog and schema, and we will use UC Volumes to store all the data.

In [None]:
import lightning as pl
from lightning.pytorch.loggers import MLFlowLogger

import torch
from torch.utils.data import DataLoader
import torchvision

from transformers import DetrForObjectDetection, DetrImageProcessor

import supervision as sv
import os

import mlflow

# --- Unity Catalog location of the data -------------------------------------
ds_catalog = 'brian_ml_dev'
ds_schame = 'image_processing'  # (sic) this spelling is referenced by later cells
coco_volume = 'coco_dataset'

# Local scratch directory on the driver/worker disk.
save_dir = '/local_disk0/train'

# MLflow experiment that all runs in this notebook are logged to.
mlflow_experiment = '/Users/brian.law@databricks.com/brian_lightning'

# --- Derived paths -----------------------------------------------------------
# Volume holding the COCO images and the annotation file.
volume_path = "/".join(["/Volumes", ds_catalog, ds_schame, coco_volume])
# Separate "training" volume used for logging output.
logging_volume_path = "/".join(["/Volumes", ds_catalog, ds_schame, "training"])
# Images live at the root of the COCO volume.
image_path = volume_path
annotation_json = volume_path + '/annotations.json'

# Hugging Face checkpoint used for both the image processor and the model.
CHECKPOINT = 'facebook/detr-resnet-50'

In [None]:
# Create the "training" volume (the one `logging_volume_path` points at) in the
# configured catalog/schema. `IF NOT EXISTS` makes re-running this cell safe.
spark.sql(f"CREATE VOLUME IF NOT EXISTS {ds_catalog}.{ds_schame}.training")

## Setting Up Data Loader

The first step is setting up the dataset and the dataloaders for our model training job

In [None]:
class CocoDetection(torchvision.datasets.CocoDetection):
    """
    COCO dataset that returns samples already run through the image processor.

    Each item is a ``(pixel_values, target)`` pair taken from the processor's
    ``"pixel_values"`` and ``"labels"`` outputs for a single image.

    Args:
        image_directory_path: Directory the image files are read from.
        image_processor: Callable invoked as
            ``image_processor(images=..., annotations=..., return_tensors="pt")``.
        train: Accepted for API compatibility; it is currently not used, and
            the annotation file is always the module-level ``annotation_json``.
    """

    def __init__(
        self,
        image_directory_path: str,
        image_processor,
        train: bool = True
    ):
        # The annotation file location comes from the notebook configuration.
        annotation_file_path = annotation_json
        super().__init__(image_directory_path, annotation_file_path)
        self.image_processor = image_processor

    def __getitem__(self, idx):
        """Return ``(pixel_values, target)`` for the sample at position ``idx``."""
        image, coco_annotations = super().__getitem__(idx)
        image_id = self.ids[idx]

        # The processor is given the raw COCO annotations together with the image id.
        raw_target = {'image_id': image_id, 'annotations': coco_annotations}
        encoding = self.image_processor(images=image, annotations=raw_target, return_tensors="pt")

        # Drop only the leading batch axis the processor adds for a single image.
        # A bare ``squeeze()`` would also remove any other dimension of size 1
        # (for example a 1-pixel-high image), silently changing the tensor rank.
        pixel_values = encoding["pixel_values"].squeeze(0)
        target = encoding["labels"][0]

        return pixel_values, target

In [None]:
# Image processor for the pretrained checkpoint; passed to the dataset and
# used by ``collate_fn`` to pad batches.
image_processor = DetrImageProcessor.from_pretrained(CHECKPOINT)


def collate_fn(batch):
    """Combine ``(pixel_values, target)`` samples into a single batch dict.

    Images in a batch can have different sizes, so they cannot be stacked
    directly. They are padded via ``image_processor.pad``, which also yields
    a ``pixel_mask`` alongside the padded ``pixel_values``.

    Returns:
        dict with keys ``pixel_values``, ``pixel_mask`` and ``labels``
        (``labels`` is the list of per-sample targets, in batch order).
    """
    images = []
    targets = []
    for sample in batch:
        images.append(sample[0])
        targets.append(sample[1])

    padded = image_processor.pad(images, return_tensors="pt")

    return dict(
        pixel_values=padded['pixel_values'],
        pixel_mask=padded['pixel_mask'],
        labels=targets,
    )

In [None]:
# Build the dataset, then the train and validation dataloaders on top of it.
TRAIN_DATASET = CocoDetection(
    image_directory_path=image_path,
    image_processor=image_processor,
    train=True,
)

print("Number of training examples:", len(TRAIN_DATASET))

TRAIN_DATALOADER = DataLoader(
    dataset=TRAIN_DATASET,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)
# NOTE(review): validation iterates the same dataset object as training, so
# validation metrics are not computed on held-out data.
VAL_DATALOADER = DataLoader(
    dataset=TRAIN_DATASET,
    batch_size=4,
    collate_fn=collate_fn,
)

# Map each COCO category id to its name, for sizing the model's label space.
categories = TRAIN_DATASET.coco.cats
id2label = {cat_id: cat['name'] for cat_id, cat in categories.items()}

In [None]:
#### Smoke test: pull one batch through the dataloader and print it, then stop.
#### This exercises the dataset, the image processor and collate_fn end to end.
for batch in TRAIN_DATALOADER:
  print(batch)
  break

## Setting up the Lightning Model Module

In [None]:

class Detr(pl.LightningModule):
    """Lightning module wrapping ``DetrForObjectDetection`` for finetuning.

    The model is loaded from the module-level ``CHECKPOINT`` with
    ``num_labels=len(id2label)``; ``ignore_mismatched_sizes=True`` lets the
    load proceed when pretrained weight shapes do not match that label count.
    Data comes from the module-level ``TRAIN_DATALOADER`` / ``VAL_DATALOADER``.

    Args:
        lr: Learning rate for parameters whose name does not contain "backbone".
        lr_backbone: Learning rate for parameters whose name contains "backbone".
        weight_decay: Weight decay passed to ``torch.optim.AdamW``.
    """

    def __init__(self, lr, lr_backbone, weight_decay):
        super().__init__()
        self.model = DetrForObjectDetection.from_pretrained(
            pretrained_model_name_or_path=CHECKPOINT,
            num_labels=len(id2label),
            ignore_mismatched_sizes=True
        )

        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay

    def forward(self, pixel_values, pixel_mask):
        """Run the wrapped model without labels and return its outputs."""
        return self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    def common_step(self, batch, batch_idx):
        """Run a forward pass with labels; shared by training and validation.

        Returns:
            ``(loss, loss_dict)`` taken from the model outputs.
        """
        pixel_values = batch["pixel_values"]
        pixel_mask = batch["pixel_mask"]
        # Labels arrive as a plain list of dicts from collate_fn, so each
        # tensor is moved to this module's device explicitly.
        labels = [{k: v.to(self.device) for k, v in t.items()} for t in batch["labels"]]

        outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)

        loss = outputs.loss
        loss_dict = outputs.loss_dict

        return loss, loss_dict

    def _log_losses(self, loss_key, component_prefix, loss, loss_dict, step):
        """Send the total loss and its components to the logger in one call.

        Does nothing when no logger is attached. The trainer may have none
        (the distributed cell passes ``logger=None`` on non-zero ranks), and
        calling ``self.logger.log_metrics`` unguarded would then raise
        ``AttributeError``.
        """
        logger = self.logger
        if logger is None:
            return

        # Log plain Python floats rather than tensors.
        metrics = {loss_key: loss.item()}
        for name, value in loss_dict.items():
            metrics[component_prefix + name] = value.item()

        # One call for all metrics instead of one call per metric.
        logger.log_metrics(metrics, step)

    def training_step(self, batch, batch_idx):
        """Compute the training loss and log it with its components."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        # NOTE(review): the step is the within-epoch batch index, so step
        # values repeat from 0 in every epoch.
        self._log_losses("training_loss", "train_", loss, loss_dict, batch_idx)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and log it with its components."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        self._log_losses("validation/loss", "validation_", loss, loss_dict, batch_idx)

        return loss

    def configure_optimizers(self):
        """Build AdamW with a separate learning rate for backbone parameters."""
        # DETR authors decided to use different learning rate for backbone
        # you can learn more about it here:
        # - https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/main.py#L22-L23
        # - https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/main.py#L131-L139
        param_dicts = [
            {
                # Uses the optimizer-wide ``lr`` given below.
                "params": [p for n, p in self.named_parameters() if "backbone" not in n and p.requires_grad]},
            {
                "params": [p for n, p in self.named_parameters() if "backbone" in n and p.requires_grad],
                "lr": self.lr_backbone,
            },
        ]
        return torch.optim.AdamW(param_dicts, lr=self.lr, weight_decay=self.weight_decay)

    def train_dataloader(self):
        """Return the module-level training dataloader."""
        return TRAIN_DATALOADER

    def val_dataloader(self):
        """Return the module-level validation dataloader."""
        return VAL_DATALOADER

# Training Loop - Single GPU in notebook

We will start with a single GPU in the notebook.
Once we scale beyond that, we will need to change our approach.

In [None]:
# Settings for the single-GPU run.
MAX_EPOCHS = 10

model = Detr(lr=1e-4, lr_backbone=1e-5, weight_decay=1e-4)

# Select the experiment explicitly before starting the run; relying on other
# mechanisms (e.g. the MLFLOW_EXPERIMENT_NAME environment variable) seemed glitchy.
mlflow.set_experiment(mlflow_experiment)

# Ask MLflow to collect system metrics for runs started in this process.
os.environ['MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING'] = 'true'

# Start the MLflow run ourselves so that the Lightning logger can be attached
# to it by run id, rather than creating a run of its own.
with mlflow.start_run(log_system_metrics=True, run_name='training_test') as run:

    # Per the original author's note, this logs the training hyperparameters.
    mlflow.pytorch.autolog()

    # experiment_name is intentionally not passed: the logger is bound to the
    # already-started run through run_id instead.
    mlf_logger = MLFlowLogger(
        tracking_uri="databricks",
        checkpoint_path_prefix="brian_testing",
        run_id=run.info.run_id
    )

    # TODO: investigate why this gets logged to the wrong experiment.
    # MLFLOW_SYSTEM_METRICS_NODE_ID can be used to set a different node prefix
    # per node when running distributed.

    # Trainer arguments for pytorch_lightning >= 2.0.0. For versions < 2.0.0
    # the equivalent was:
    #   Trainer(gpus=1, max_epochs=MAX_EPOCHS, gradient_clip_val=0.1,
    #           accumulate_grad_batches=8, log_every_n_steps=5)
    trainer = pl.Trainer(devices=1, accelerator="gpu", max_epochs=MAX_EPOCHS,
                     gradient_clip_val=0.1, accumulate_grad_batches=8, log_every_n_steps=5,
                     logger=mlf_logger)

    # The dataloaders come from the module's train_dataloader / val_dataloader.
    trainer.fit(model)

# Moving to TorchDistributor

In order to distribute effectively across databricks, we will use TorchDistributor

As that results in our code running in separate Python processes, we need to set up credentials to make sure that those processes can communicate back with MLflow.

In [None]:
from pyspark.ml.torch.distributor import TorchDistributor

### Setup of MLflow variables

### Distributed MLflow config: the login credentials are captured here on the
### driver and set manually inside the training function.
# Workspace hostname as reported by the Spark configuration.
browser_host = spark.conf.get("spark.databricks.workspaceUrl")
db_host = f"https://{browser_host}"
# API token taken from the current notebook context.
# NOTE(review): this is a secret; avoid printing or logging it.
db_token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

# Training Loop - Multi GPU Single Node

When we move to multiple GPUs with Lightning, we need to switch to TorchDistributor.
(This differs per framework; for example, native Hugging Face training can run multi-GPU directly in the notebook.)

In [None]:
# Cluster shape for this run: one process is launched per GPU.
num_gpus_per_node = 2
num_nodes = 1
num_processes = num_nodes * num_gpus_per_node
# Passed to TorchDistributor as local_mode: true only for a single node.
local_status = (num_nodes == 1)

MAX_EPOCHS = 3

def training_function(total_gpus: int):
    """Finetune ``Detr`` with the DDP strategy; run once per worker process.

    The process whose ``RANK`` environment variable is 0 (or unset) opens an
    MLflow run and trains with an ``MLFlowLogger`` bound to that run. Every
    other process trains with ``logger=None``.

    Args:
        total_gpus: Passed to ``pl.Trainer`` as ``devices``.

    Returns:
        The ``pl.Trainer`` instance after ``fit`` has returned.
    """
    # The MLflow connection settings are exported here, inside the function,
    # due to glitches with os.environ otherwise.
    os.environ.update({
        'MLFLOW_TRACKING_URI': 'databricks',
        'DATABRICKS_HOST': db_host,
        'DATABRICKS_TOKEN': db_token,
    })

    mlflow.set_experiment(mlflow_experiment)

    model = Detr(lr=1e-4, lr_backbone=1e-5, weight_decay=1e-4)

    def fit_with(logger):
        # Same Trainer configuration for every rank; only the logger differs.
        # Note strategy='ddp', the addition over the single-GPU setup.
        # NOTE(review): default_root_dir sits under the COCO data volume, not
        # under logging_volume_path -- confirm this is intended.
        ddp_trainer = pl.Trainer(
            default_root_dir=f'{volume_path}/training',
            devices=total_gpus,
            accelerator="gpu",
            max_epochs=MAX_EPOCHS,
            gradient_clip_val=0.1,
            accumulate_grad_batches=8,
            log_every_n_steps=5,
            logger=logger,
            strategy='ddp',
        )
        ddp_trainer.fit(model)
        return ddp_trainer

    is_rank_zero = int(os.environ.get("RANK", 0)) == 0
    if not is_rank_zero:
        return fit_with(None)

    with mlflow.start_run(log_system_metrics=True, run_name='training_test') as run:
        # Per the original author's note, this logs the training hyperparameters.
        mlflow.pytorch.autolog()

        rank_zero_logger = MLFlowLogger(
            tracking_uri="databricks",
            run_id=run.info.run_id,
            save_dir=f'{logging_volume_path}',
            checkpoint_path_prefix=f'{logging_volume_path}/checkpoints',
        )
        trainer = fit_with(rank_zero_logger)

    return trainer
    

# Launch `training_function` through TorchDistributor with `num_processes`
# processes; `local_status` selects local mode when there is a single node.
distributor = TorchDistributor(num_processes=num_processes, 
                               local_mode=local_status, use_gpu=True)

# `num_processes` is forwarded as the `total_gpus` argument of training_function.
train_obj = distributor.run(training_function, num_processes)

# Distributed Cluster Training Job

In order to train on a cluster we will need to package up the code even more.

This becomes a bit hard to follow within the notebook format, so although we could keep it within the notebook, it will be easier to package it up into a training script. This will also make it easier to transfer to our newer model training product that is currently under development.

In [None]:
import json
from pyspark.ml.torch.distributor import TorchDistributor

from mlflow.data.uc_volume_dataset_source import UCVolumeDatasetSource
from mlflow.data.dataset import Dataset
from mlflow.data import load_delta

# Cluster shape for the multi-node run: one process is launched per GPU.
num_gpus_per_node = 2
num_nodes = 2
num_processes = num_nodes * num_gpus_per_node
# Passed to TorchDistributor as local_mode: true only for a single node.
local_status = (num_nodes == 1)

# NOTE(review): `source_data` is not referenced again in the visible code, and
# it names a different table (gold_...) than the one logged below (silver_...).
source_data = spark.sql(f"SELECT * FROM {ds_catalog}.{ds_schame}.gold_detr_results_training_frame")
### Test out multi-node training

mlflow.set_experiment(mlflow_experiment)

# The run is opened on the driver; its run id is handed to the training script
# through `entry_dict` below.
with mlflow.start_run(run_name='multi-node') as run:
  
  # Upstream Delta table, recorded as an input of this run.
  source_table = load_delta(
    table_name=f'{ds_catalog}.{ds_schame}.silver_detr_results_w_frame',
    name = "run_source_data"
  )
  
  # The COCO volume is recorded as the source of the train and val datasets.
  dataset = UCVolumeDatasetSource(path=f'{volume_path}')

  # A digest is a short key identifying a dataset, usually the first few
  # characters of some hash.
  # NOTE(review): 'abc' is a hard-coded placeholder shared by both datasets;
  # a folder digest (e.g. compute_folder_digest(volume_path)) was left
  # commented out in the original.
  cache_train = Dataset(source=dataset, name="train_dataset", digest='abc')
  cache_val = Dataset(source=dataset, name="val_dataset", digest='abc')

  mlflow.log_input(source_table, context="upstream_table")
  mlflow.log_input(cache_train, context="train_dataset")
  mlflow.log_input(cache_val, context="val_dataset")

  distributor = TorchDistributor(num_processes=num_processes, 
                               local_mode=local_status, use_gpu=True)

  # Settings handed to the training script as a single JSON string argument.
  # NOTE(review): this includes the API token, so the secret travels as a
  # command-line argument -- confirm that is acceptable.
  entry_dict = {'batch_size': num_processes*3,
              'max_epochs':2,
              'total_gpus': num_processes,
              'host': db_host,
              'token': db_token,
              'run_id': run.info.run_id,
              'uc_catalog': ds_catalog,
              'uc_schema': ds_schame,
              'mlflow_experiment': mlflow_experiment}

  # Run the packaged training script instead of an in-notebook function.
  train_obj = distributor.run('scripts/tune_model.py', json.dumps(entry_dict))