# Model Training

This notebook demonstrates the model training pipeline using PyTorch Lightning and Ray for distributed training.

In [0]:
# Reload imported project modules automatically before code is executed, so edits to
# local .py files are picked up without restarting the Python process.
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
# Install dependencies
# NOTE(review): the packages are unpinned, so a fresh run may resolve different releases;
# consider pinning exact versions once a known-good set is established.
%pip install pytorch-lightning ray[default] mlflow pycocotools albumentations
# Restart the Python process so the freshly installed packages are the ones imported below.
# NOTE(review): the restart presumably also discards the autoreload extension loaded in the
# previous cell — confirm, and re-run that cell after this one if so.
dbutils.library.restartPython()

In [0]:
# Import required modules
import ray
from ray import train
import pytorch_lightning as pl
import mlflow
from models.base.base_model import BaseModel, ModelConfig
from trainer.ray_trainer import RayTrainer
from trainer.base_trainer import BaseTrainer
from models.architectures.classification import ClassificationModel
from data.processing.coco_processor import COCOProcessor
from data.processing.data_loader import COCODataset, get_transforms, create_dataloader
import torch
import torch.nn as nn

## Initialize Ray

Set up Ray for distributed training.

In [0]:
import os
import sys

# Prepend the project root to PYTHONPATH so that processes started from this one
# (presumably the Ray workers — confirm) can import the project packages.
project_dir = os.path.abspath("..")  # or the specific path to your project

# Keep the entries that are already present, but drop empty ones and any earlier copy
# of project_dir: the value then never ends in a dangling separator, and re-running
# this cell does not keep growing the variable.
_existing_entries = [
    entry
    for entry in os.environ.get('PYTHONPATH', '').split(os.pathsep)
    if entry and entry != project_dir
]

# os.pathsep is ":" on POSIX, so the resulting format matches the previous behaviour there.
os.environ['PYTHONPATH'] = os.pathsep.join([project_dir, *_existing_entries])
os.getenv("PYTHONPATH")

In [0]:
from mlflow.utils.databricks_utils import get_databricks_env_vars

# Export the Databricks host and token as environment variables; they are read back
# from os.environ when the Ray trainer is constructed later in the notebook.
mlflow_dbrx_creds = get_databricks_env_vars("databricks")
for _cred_key in ("DATABRICKS_HOST", "DATABRICKS_TOKEN"):
    os.environ[_cred_key] = mlflow_dbrx_creds[_cred_key]

In [0]:
# Initialize Ray
# ignore_reinit_error=True keeps this cell safe to re-run: without it, a second
# ray.init() in the same Python process raises because Ray is already initialized.
ray.init(ignore_reinit_error=True)
print(ray.cluster_resources())

In [0]:
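# Alternative (disabled): start a Ray-on-Spark cluster with ray.util.spark instead of the
# plain ray.init() call in the previous cell. Adjust the cluster sizes and workspace_dir
# below before uncommenting.
# from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster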
# from mlflow.utils.databricks_utils import get_databricks_env_vars
# import ray
# import os
# import sys

# # Cluster cleanup
# restart = True
# if restart is True:
#   try:
#     shutdown_ray_cluster()
#   except:
#     pass

#   try:
#     ray.shutdown()
#   except:
#     pass

# # Set configs based on your cluster size
# num_cpu_cores_per_worker = 47
# num_cpus_head_node = 12
# num_gpus_head_node = 1
# num_gpus_worker_node = 4

# # Set databricks credentials as env vars
# mlflow_dbrx_creds = get_databricks_env_vars("databricks")
# os.environ["DATABRICKS_HOST"] = mlflow_dbrx_creds['DATABRICKS_HOST']
# os.environ["DATABRICKS_TOKEN"] = mlflow_dbrx_creds['DATABRICKS_TOKEN']

# # Get the current working directory to include in the Python path
# current_dir = os.getcwd()
# workspace_dir = "/Workspace/Users/alex.miller@databricks.com/databricks-cv-architecture"

# # Configure runtime environment to include your custom modules
# runtime_env = {
#     "py_modules": [workspace_dir],  # Include your workspace directory
#     "env_vars": {
#         "PYTHONPATH": f"{workspace_dir}:{os.environ.get('PYTHONPATH', '')}"
#     }
# }

# ray_conf = setup_ray_cluster(
#   min_worker_nodes=1,
#   max_worker_nodes=1,
#   num_cpus_head_node=num_cpus_head_node,
#   num_cpus_per_node=num_cpu_cores_per_worker,
#   num_gpus_head_node=num_gpus_head_node,
#   num_gpus_worker_node=num_gpus_worker_node,
#   runtime_env=runtime_env  # Pass the runtime environment
# )

# os.environ['RAY_ADDRESS'] = ray_conf[0]
# ray.init(runtime_env=runtime_env)  # Also set runtime_env here


## Configure Model

Set up model configuration and architecture.

In [0]:
# Hyperparameters for this run, gathered in one mapping and unpacked into ModelConfig.
# NOTE(review): T_max (15), max_epochs here (1) and the max_epochs placed in
# training_config further down (100) disagree — confirm which value is intended.
_model_hparams = dict(
    learning_rate=1e-4,
    weight_decay=1e-5,
    task='classification',
    optimizer='adamw',
    scheduler='cosine',
    scheduler_params={'T_max': 15, 'eta_min': 1e-6},  # T_max is set explicitly
    max_epochs=1,
)
config = ModelConfig(**_model_hparams)


# Build the classification network and hand it to BaseModel together with the config.
classification_model = ClassificationModel()
model = BaseModel(model=classification_model, config=config)

### Set Up Dataloaders

There are three ways to feed data into the Ray trainer; this notebook uses Option 1.
- Option 1: COCODataset -> PyTorch dataloader -> pass into Ray trainer
- Option 2: Delta table -> Ray data -> pass into Ray trainer
- Option 3: COCODataset -> Ray data -> pass into Ray trainer

In [0]:
image_dir = "/Volumes/users/aradhya_chouhan/coco_mini-train/data/val2017/"
annotation_file = "/Volumes/users/aradhya_chouhan/coco_mini-train/data/instances_val2017.json"

# NOTE(review): both splits below are views of this one dataset object, so the validation
# split also receives the mode='train' transforms — confirm whether validation should
# use its own (non-augmenting) transforms.
dataset = COCODataset(image_dir=image_dir, annotation_file=annotation_file, transform=get_transforms(mode='train'))

# Seed the split so the same 80/20 train/validation partition is produced on every run;
# an unseeded random_split yields a different partition each time the cell executes.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [0.8, 0.2], generator=split_generator
)

# Create dataloaders
train_dataloader = create_dataloader(
    train_dataset,
    batch_size=8,
    num_workers=4,
    shuffle=True
)
# Validation data is not shuffled: evaluation gains nothing from a random order, and a
# fixed order keeps validation runs comparable.
val_dataloader = create_dataloader(
    val_dataset,
    batch_size=8,
    num_workers=4,
    shuffle=False
)

## Set Up Training

Configure the training pipeline with Ray.

In [0]:
# Initialize trainer
base_trainer = BaseTrainer(
    model=model,
    model_config=config
)

# The resource mapping may have no "GPU" entry (e.g. on a CPU-only cluster). Default to 0
# rather than passing None to int(), and fall back to a single CPU worker in that case.
num_gpus = int(ray.cluster_resources().get("GPU", 0))
trainer = RayTrainer(
    trainer=base_trainer,
    num_workers=max(num_gpus, 1),
    use_gpu=num_gpus > 0,
    databricks_host=os.environ["DATABRICKS_HOST"],
    databricks_token=os.environ["DATABRICKS_TOKEN"]
)

# Configure training
# notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
# NOTE(review): max_epochs here (100) differs from the ModelConfig created earlier
# (max_epochs=1) — confirm which value the trainer actually uses.
training_config = {
    "experiment_name": "/Users/alex.miller@databricks.com/cv_experiment",
    "run_name": "training_run_1",
    "max_epochs": 100,
    "checkpoint_dir": "/Volumes/main/alex_m/coco_dataset/checkpoints",
    "model_path": "/Volumes/main/alex_m/coco_dataset/model",
    "train_loader": train_dataloader,
    "val_loader": val_dataloader
}

In [0]:
# Echo the assembled configuration as the cell output for a quick check before training.
training_config

In [0]:
# import os

# os.mkdir("/Volumes/main/alex_m/coco_dataset/checkpoints")
# os.mkdir("/Volumes/main/alex_m/coco_dataset/model")

## Start Training

Begin the distributed training process.

In [0]:
# Launch the training run with the configuration assembled above.
result = trainer.train(training_config)

# Report the outcome: where the best model was saved and the final metrics.
print("Training completed!")
best_model_path = result['best_model_path']
print(f"Best model path: {best_model_path}")
final_metrics = result['metrics']
print(f"Final metrics: {final_metrics}")

## Visualize Training Progress

Plot training metrics and learning curves.

In [0]:
import matplotlib.pyplot as plt

def plot_metrics(metrics):
    """Plot the loss curves and the learning-rate curve side by side.

    Args:
        metrics: Mapping from metric name to a sequence of values (presumably one
            value per epoch — confirm against the trainer). The keys 'train_loss',
            'val_loss' and 'learning_rate' are plotted; a key that is absent is
            skipped instead of raising KeyError.

    Returns:
        None. The figure is rendered with plt.show().
    """
    # Explicit figure/axes objects instead of the implicit pyplot "current axes" state.
    fig, (loss_ax, lr_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: training and validation loss on shared axes.
    for key, label in (('train_loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
        if key in metrics:
            loss_ax.plot(metrics[key], label=label)
    loss_ax.set(title='Loss', xlabel='Epoch', ylabel='Loss')

    # Right panel: learning rate.
    if 'learning_rate' in metrics:
        lr_ax.plot(metrics['learning_rate'], label='Learning Rate')
    lr_ax.set(title='Learning Rate', xlabel='Epoch', ylabel='Learning Rate')

    # Only draw a legend when something was plotted; an empty legend triggers a warning.
    for ax in (loss_ax, lr_ax):
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()

    fig.tight_layout()
    plt.show()

# Plot the metrics returned by the training run
plot_metrics(result['metrics'])