<a href="https://colab.research.google.com/github/Dinhthixuanbinh/object-detection/blob/main/custom_mask_rcnn_training_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install roboflow

from roboflow import Roboflow
# Create the Roboflow API client.
# NOTE(review): "..." looks like a redacted placeholder — supply a real API key
# (preferably read from an environment variable / notebook secret).
rf = Roboflow(api_key="...")
# Select the "had-v2" project inside the "hadv2" workspace.
project = rf.workspace("hadv2").project("had-v2")
# Pin dataset version 8 of that project.
version = project.version(8)
# Download the dataset export in "coco" format; `rf`, `project`, `version` and
# `dataset` stay bound as notebook globals for later cells.
dataset = version.download("coco")


Collecting roboflow
  Downloading roboflow-1.1.49-py3-none-any.whl.metadata (9.7 kB)
Collecting filetype (from roboflow)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading roboflow-1.1.49-py3-none-any.whl (80 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.9/80.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Installing collected packages: filetype, roboflow
Successfully installed filetype-1.2.0 roboflow-1.1.49
loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in HAD-v2-8 to coco:: 100%|██████████| 803926/803926 [00:18<00:00, 43910.63it/s]





Extracting Dataset Version Zip to HAD-v2-8 in coco:: 100%|██████████| 9168/9168 [00:02<00:00, 3653.10it/s]


In [None]:
%%writefile /kaggle/working/dataset.py

import os
import torch
from torch.utils.data import Dataset
from PIL import Image
import numpy as np
from pycocotools.coco import COCO
from loguru import logger

class COCODetectionDataset(Dataset):
    """COCO-format detection dataset yielding ``(image, target)`` pairs.

    Only annotations whose category name appears in ``class_map`` are kept.
    Labels are emitted in the range ``1..len(class_map)``; label ``0`` is left
    free because torchvision detection models reserve it for background.

    Args:
        root: Directory containing the image files referenced by the
            annotation file.
        annotation_file: Path to the COCO JSON annotation file.
        transforms: Optional callable applied to the PIL image only (the
            target is not passed through it).
    """

    def __init__(self, root, annotation_file, transforms=None):
        self.root = root
        self.transforms = transforms
        self.coco = COCO(annotation_file)

        # Class name -> contiguous training label (1-based, 0 = background).
        self.class_map = {
            'bus': 1,
            'car': 2,
            'crosswalk': 3,
            'person': 4,
            'stop sign': 5,
            'traffic light': 6,
            'truck': 7
        }

        # COCO category id -> training label, for the categories we know.
        self.category_mapping = {
            cat['id']: self.class_map[cat['name']]
            for cat in self.coco.loadCats(self.coco.getCatIds())
            if cat['name'] in self.class_map
        }

        # Keep only images that carry at least one annotation.
        self.ids = sorted(
            img_id
            for img_id in self.coco.getImgIds()
            if self.coco.getAnnIds(imgIds=img_id)
        )

        logger.info(f"Loaded dataset with {len(self.ids)} valid images and {len(self.class_map)} classes")
        logger.info(f"Classes: {list(self.class_map.keys())}")

    def __len__(self):
        """Return the number of images that have at least one annotation."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(image, target)`` where ``target`` holds ``boxes`` (N x 4,
            ``[x_min, y_min, x_max, y_max]``), ``labels``, ``image_id``,
            ``area`` and ``iscrowd`` tensors, or ``None`` when the image file
            cannot be opened (callers are expected to filter ``None`` out,
            e.g. in the collate function).
        """
        img_id = self.ids[idx]
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        coco_annotations = self.coco.loadAnns(ann_ids)
        image_info = self.coco.loadImgs(img_id)[0]

        # Build the path outside the try block so the error message below can
        # always reference it.
        image_path = os.path.join(self.root, image_info['file_name'])
        try:
            image = Image.open(image_path).convert("RGB")
        except Exception as e:
            logger.error(f"Error loading image {image_path}: {str(e)}")
            return None

        boxes = []
        labels = []
        areas = []
        iscrowd = []

        for ann in coco_annotations:
            label = self.category_mapping.get(ann['category_id'])
            if label is None:
                continue

            x_min, y_min, width, height = ann['bbox']
            # Degenerate boxes are skipped.
            if width <= 0 or height <= 0:
                continue

            # COCO stores [x, y, w, h]; convert to corner format.
            boxes.append([x_min, y_min, x_min + width, y_min + height])
            # Keep the 1-based label: 0 is the background class, so shifting
            # by -1 would turn the first class into background.
            labels.append(label)
            areas.append(width * height)
            iscrowd.append(ann.get('iscrowd', 0))

        if not boxes:
            # Explicit shapes so an image with no usable boxes still yields
            # well-formed (0, 4) / (0,) tensors.
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
            areas = torch.zeros((0,), dtype=torch.float32)
            iscrowd = torch.zeros((0,), dtype=torch.int64)
        else:
            boxes = torch.as_tensor(boxes, dtype=torch.float32)
            labels = torch.as_tensor(labels, dtype=torch.int64)
            areas = torch.as_tensor(areas, dtype=torch.float32)
            iscrowd = torch.as_tensor(iscrowd, dtype=torch.int64)

        target = {
            'boxes': boxes,
            'labels': labels,
            'image_id': torch.tensor([img_id]),
            'area': areas,
            'iscrowd': iscrowd
        }

        if self.transforms is not None:
            image = self.transforms(image)

        return image, target

Writing /kaggle/working/dataset.py


In [None]:
%%writefile /kaggle/working/GPU_optimizer.py
# GPU_optimizer.py
import torch
from torch.utils.data import DataLoader
from loguru import logger
import time

class GPUOptimizer:
    """Utility class for GPU memory optimization in deep learning training."""

    def __init__(self, device='cuda'):
        """Store the target device and create a CUDA gradient scaler.

        Args:
            device: Device (string or ``torch.device``) that ``to_device``
                moves data to.
        """
        self.device = device
        # Prefer the non-deprecated torch.amp API; fall back for older
        # PyTorch releases that only ship torch.cuda.amp.GradScaler.
        scaler_cls = getattr(torch.amp, 'GradScaler', None)
        if scaler_cls is not None:
            self.grad_scaler = scaler_cls('cuda')
        else:
            self.grad_scaler = torch.cuda.amp.GradScaler()

    @staticmethod
    def get_optimal_batch_size(model, sample_input, device, start_batch_size=4):
        """Dynamically find the largest batch size that fits in memory.

        Starting at ``start_batch_size``, runs a forward pass on a batch of
        repeated ``sample_input`` and halves the batch size on every CUDA
        out-of-memory error.

        Returns:
            The first batch size whose forward pass succeeds.

        Raises:
            RuntimeError: If no batch size >= 1 fits.
        """
        batch_size = start_batch_size
        while batch_size > 0:
            try:
                # Try to process a batch
                sample_batch = [sample_input] * batch_size
                with torch.amp.autocast('cuda'):
                    model(sample_batch)
                torch.cuda.empty_cache()
                return batch_size
            except torch.cuda.OutOfMemoryError:
                batch_size = batch_size // 2
                torch.cuda.empty_cache()
        raise RuntimeError("Could not find a valid batch size")

    @staticmethod
    def optimize_dataloader(dataset, batch_size, num_workers, collate_fn=None):
        """Create a memory-optimized DataLoader.

        Returns a shuffling ``DataLoader`` with pinned memory. Workers are
        kept alive between epochs only when there are workers to keep:
        ``persistent_workers=True`` with ``num_workers=0`` raises ValueError.
        """
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
            collate_fn=collate_fn,
            pin_memory=True,
            persistent_workers=num_workers > 0
        )

    def get_cuda_streams(self):
        """Get CUDA streams for overlapped computation."""
        return {
            'data': torch.cuda.Stream(),
            'compute': torch.cuda.Stream()
        }

    @staticmethod
    def get_memory_stats():
        """Get current GPU memory statistics as formatted MB strings."""
        return {
            'allocated': f"{torch.cuda.memory_allocated() / 1024**2:.1f}MB",
            'cached': f"{torch.cuda.memory_reserved() / 1024**2:.1f}MB",
            'max_allocated': f"{torch.cuda.max_memory_allocated() / 1024**2:.1f}MB"
        }

    @staticmethod
    def handle_oom_error(iteration):
        """Handle out-of-memory errors: free cached blocks and log the state."""
        torch.cuda.empty_cache()
        memory_stats = GPUOptimizer.get_memory_stats()
        logger.error(
            f"OOM error at iteration {iteration}. "
            f"Current memory state: {memory_stats}"
        )

    def enable_memory_efficient_training(self, model):
        """Enable memory-efficient training features for the model.

        Calls ``model.gradient_checkpointing_enable()`` when the model exposes
        it; otherwise the model is returned untouched.
        """
        if hasattr(model, 'gradient_checkpointing_enable'):
            model.gradient_checkpointing_enable()
        return model

    def to_device(self, data, non_blocking=True):
        """Efficiently move data to GPU.

        Recurses into lists/tuples (returned as lists) and dicts; anything
        with a ``.to`` method is moved to ``self.device``; other values are
        returned unchanged.

        Args:
            data: Tensor, container of tensors, or arbitrary object.
            non_blocking: Forwarded to every ``.to`` call, including those on
                nested items.
        """
        if isinstance(data, (list, tuple)):
            return [self.to_device(item, non_blocking) for item in data]
        elif isinstance(data, dict):
            return {k: self.to_device(v, non_blocking) for k, v in data.items()}
        elif hasattr(data, 'to'):
            return data.to(self.device, non_blocking=non_blocking)
        return data

Writing /kaggle/working/GPU_optimizer.py


In [None]:
%%writefile /kaggle/working/trainer.py

import os
import time
import torch
import numpy as np
from loguru import logger
import torchvision
from torchvision.models.detection import (
    maskrcnn_resnet50_fpn,
    fasterrcnn_resnet50_fpn
)
from torch.utils.data import DataLoader
from config import TrainingConfig
from GPU_optimizer import GPUOptimizer

class ObjectDetectionTrainer:
    """Train and validate a torchvision Faster R-CNN detector.

    The trainer builds the model, an AdamW optimizer and a per-iteration
    warmup/decay learning-rate schedule, runs mixed-precision training and
    tracks validation loss to decide which checkpoint is "best".
    """

    def __init__(self, config: "TrainingConfig"):
        self.config = config
        self.device = torch.device(config.DEVICE)
        self.gpu_optimizer = GPUOptimizer(self.device)
        self._setup_logging()

    def _setup_logging(self):
        """Attach the file sink from the config and log the active device."""
        logger.add(self.config.LOG_FILE, rotation="100 MB")
        logger.info(f"Using device: {self.device}")

    @staticmethod
    def collate_fn(batch):
        """Collate detection samples, dropping ``None`` entries.

        Returns:
            ``(images, targets)`` tuples, or ``([], [])`` when every sample in
            the batch was ``None``.
        """
        batch = [sample for sample in batch if sample is not None]
        if not batch:
            return [], []
        return tuple(zip(*batch))

    def _get_model(self, model_name: str):
        """Build the detector and move it to the training device.

        ``model_name`` is only used for logging here: a Faster R-CNN
        (ResNet-50 FPN) is always constructed.
        """
        logger.info(f"Initializing {model_name} model...")
        try:
            model = fasterrcnn_resnet50_fpn(num_classes=self.config.NUM_CLASSES)
            model = model.to(self.device)
            model.train()
            logger.info(f"Successfully initialized {model_name} model")
            return model
        except Exception as e:
            logger.error(f"Error initializing model {model_name}: {str(e)}")
            raise

    def _get_optimizer(self, model):
        """Return an AdamW optimizer configured from the training config."""
        return torch.optim.AdamW(
            model.parameters(),
            lr=self.config.LEARNING_RATE,
            weight_decay=self.config.WEIGHT_DECAY
        )

    def _get_scheduler(self, optimizer):
        """Return a per-iteration LambdaLR: linear warmup, then decay.

        The learning rate ramps linearly from 0 over the first epoch's worth
        of iterations, then follows ``0.1 ** (step / total_steps)``. The
        scheduler must be stepped once per optimizer step, not per epoch.
        Requires ``self.train_loader`` to be set.
        """
        warmup_steps = len(self.train_loader)
        # max(1, ...) avoids a division by zero with an empty loader.
        num_steps = max(1, warmup_steps * self.config.NUM_EPOCHS)

        def lr_lambda(step):
            if step < warmup_steps:
                return float(step) / float(max(1, warmup_steps))
            return 0.1 ** (step / num_steps)

        return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    def train_one_epoch(self, model, optimizer, scheduler, model_name, epoch):
        """Run one training epoch with mixed precision.

        Batches that are empty, produce a non-finite loss, or raise are
        skipped; the reported average loss covers only the batches that were
        actually used for an optimizer step.

        Returns:
            Dict with ``avg_loss`` (NaN if no batch was processed), ``fps``
            (dataset images per second) and ``elapsed_time`` in seconds.
        """
        model.train()
        scaler = self.gpu_optimizer.grad_scaler
        running_loss = 0.0
        num_batches = 0
        start_time = time.time()

        for i, (images, targets) in enumerate(self.train_loader):
            try:
                if not images or not targets:
                    continue

                images = [image.to(self.device) for image in images]
                targets = [{k: v.to(self.device) for k, v in t.items()} for t in targets]

                with torch.amp.autocast('cuda'):
                    loss_dict = model(images, targets)
                    losses = sum(loss for loss in loss_dict.values())

                if not torch.isfinite(losses):
                    logger.warning(f"Loss is {losses}, skipping batch")
                    continue

                optimizer.zero_grad(set_to_none=True)
                # Scale the loss so fp16 gradients produced under autocast do
                # not underflow; unscale before clipping so max_norm applies
                # to the true gradient magnitudes.
                scaler.scale(losses).backward()
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)

                scale_before = scaler.get_scale()
                scaler.step(optimizer)
                scaler.update()
                # The scaler lowers its scale when it skipped the optimizer
                # step (inf/NaN gradients); only advance the LR schedule when
                # the optimizer really stepped.
                if scheduler is not None and scaler.get_scale() >= scale_before:
                    scheduler.step()

                running_loss += losses.detach().cpu().item()
                num_batches += 1

                if i % 10 == 0:
                    avg_loss = running_loss / num_batches
                    current_lr = optimizer.param_groups[0]['lr']
                    memory_stats = self.gpu_optimizer.get_memory_stats()
                    logger.info(
                        f"Epoch {epoch}, Iteration {i}, "
                        f"Loss: {avg_loss:.4f}, "
                        f"LR: {current_lr:.6f}, "
                        f"Memory: {memory_stats}"
                    )

            except torch.cuda.OutOfMemoryError:
                # Drop any partial gradients, release cached blocks and log
                # the memory state, then move on to the next batch.
                optimizer.zero_grad(set_to_none=True)
                self.gpu_optimizer.handle_oom_error(i)
                continue
            except Exception as e:
                logger.error(f"Error in training: {str(e)}")
                continue

        elapsed_time = time.time() - start_time
        avg_loss = running_loss / num_batches if num_batches else float('nan')
        fps = len(self.train_loader.dataset) / elapsed_time if elapsed_time > 0 else 0.0

        return {
            'avg_loss': avg_loss,
            'fps': fps,
            'elapsed_time': elapsed_time
        }

    def validate(self, model, val_loader):
        """Compute the average detection loss over ``val_loader``.

        torchvision detection models only return a loss dict in training
        mode (in eval mode they return detections), so the forward passes run
        with ``model.train()`` under ``torch.no_grad()``. The model is left
        in eval mode afterwards.
        NOTE(review): this is safe for stats only if the model has no
        trainable BatchNorm/Dropout layers — confirm for custom backbones.

        Returns:
            Dict with ``val_loss`` and ``map`` (both the average loss, ``map``
            kept for backward compatibility; NaN if no batch was processed),
            ``val_fps`` and ``elapsed_time``.
        """
        model.train()
        total_loss = 0.0
        num_batches = 0
        start_time = time.time()

        try:
            with torch.no_grad():
                for images, targets in val_loader:
                    # collate_fn yields ([], []) when every sample failed.
                    if not images or not targets:
                        continue

                    images = [image.to(self.device) for image in images]
                    targets = [{k: v.to(self.device) for k, v in t.items()} for t in targets]

                    loss_dict = model(images, targets)
                    total_loss += float(sum(loss for loss in loss_dict.values()))
                    num_batches += 1
        finally:
            model.eval()

        elapsed_time = time.time() - start_time
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        fps = len(val_loader.dataset) / elapsed_time if elapsed_time > 0 else 0.0

        return {
            'map': avg_loss,  # Using loss as proxy for mAP
            'val_loss': avg_loss,
            'val_fps': fps,
            'elapsed_time': elapsed_time
        }

    def save_checkpoint(self, model, optimizer, scheduler, metrics, model_name, epoch, is_best=False):
        """Write model/optimizer/scheduler state under ``MODEL_DIR/model_name``.

        The file is ``model_best.pth`` when ``is_best`` is true, otherwise
        ``checkpoint_epoch_{epoch}.pth``.
        """
        checkpoint_dir = os.path.join(self.config.MODEL_DIR, model_name)
        os.makedirs(checkpoint_dir, exist_ok=True)

        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'metrics': metrics,
        }

        if is_best:
            path = os.path.join(checkpoint_dir, 'model_best.pth')
            logger.info(f"Saving best model with metrics: {metrics}")
        else:
            path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pth')
            logger.info(f"Saving checkpoint to {path}")

        torch.save(checkpoint, path)

    def train(self, model_name: str, train_loader, val_loader):
        """Run the full training loop.

        Args:
            model_name: Name used for logging and the checkpoint directory.
            train_loader: DataLoader yielding ``(images, targets)`` batches.
            val_loader: DataLoader used for the per-epoch validation loss.

        Returns:
            List with one metrics dict per epoch.
        """
        logger.info(f"Starting training for model: {model_name}")
        self.train_loader = train_loader
        self.val_loader = val_loader

        model = self._get_model(model_name)
        optimizer = self._get_optimizer(model)
        scheduler = self._get_scheduler(optimizer)

        best_val_loss = float('inf')
        metrics_summary = []

        for epoch in range(self.config.NUM_EPOCHS):
            logger.info(f"Starting epoch {epoch}")

            # The scheduler is stepped per iteration inside train_one_epoch;
            # it must not be stepped again per epoch with the validation loss
            # (LambdaLR would treat that value as an epoch index).
            train_metrics = self.train_one_epoch(model, optimizer, scheduler, model_name, epoch)
            val_metrics = self.validate(model, val_loader)

            epoch_metrics = {
                'epoch': epoch,
                'train_loss': train_metrics['avg_loss'],
                'train_fps': train_metrics['fps'],
                'val_loss': val_metrics['map'],
                'val_fps': val_metrics['val_fps'],
                'learning_rate': optimizer.param_groups[0]['lr']
            }
            metrics_summary.append(epoch_metrics)

            logger.info("\n" + "="*50)
            logger.info(f"Epoch {epoch} Summary:")
            logger.info(f"Training Loss: {epoch_metrics['train_loss']:.4f}")
            logger.info(f"Training FPS: {epoch_metrics['train_fps']:.2f}")
            logger.info(f"Validation Loss: {epoch_metrics['val_loss']:.4f}")
            logger.info(f"Validation FPS: {epoch_metrics['val_fps']:.2f}")
            logger.info(f"Learning Rate: {epoch_metrics['learning_rate']:.6f}")
            logger.info("="*50 + "\n")

            # Lower validation loss is better.
            if epoch_metrics['val_loss'] < best_val_loss:
                best_val_loss = epoch_metrics['val_loss']
                self.save_checkpoint(
                    model, optimizer, scheduler,
                    epoch_metrics, model_name, epoch,
                    is_best=True
                )

            if (epoch + 1) % self.config.SAVE_FREQ == 0:
                self.save_checkpoint(
                    model, optimizer, scheduler,
                    epoch_metrics, model_name, epoch
                )

        logger.info("\nTraining Complete!")
        logger.info(f"Best Validation Loss: {best_val_loss:.4f}")

        avg_metrics = {
            key: np.mean([m[key] for m in metrics_summary])
            for key in ('train_loss', 'train_fps', 'val_loss', 'val_fps')
        }

        logger.info("\nAverage Metrics:")
        logger.info(f"Avg Training Loss: {avg_metrics['train_loss']:.4f}")
        logger.info(f"Avg Training FPS: {avg_metrics['train_fps']:.2f}")
        logger.info(f"Avg Validation Loss: {avg_metrics['val_loss']:.4f}")
        logger.info(f"Avg Validation FPS: {avg_metrics['val_fps']:.2f}")

        return metrics_summary

Overwriting /kaggle/working/trainer.py


In [None]:
%%writefile /kaggle/working/transforms.py
"""
Transforms for object detection
"""
import torchvision.transforms as T

class TransformFactory:
    """Factory for image-only transforms used by the detection datasets."""

    @staticmethod
    def get_transforms(is_train: bool = True):
        """Get transforms for training or validation

        The returned pipeline only converts the PIL image to a float tensor
        in the 0-1 range, for two reasons:

        * These transforms receive the image only, never the target. A random
          horizontal flip here would mirror the picture while leaving the
          bounding boxes untouched, silently corrupting the labels. Geometric
          augmentation must be done by a transform that updates boxes too.
        * torchvision detection models normalize their inputs internally with
          the ImageNet mean/std, so normalizing here would apply it twice.

        Args:
            is_train: Kept for interface compatibility; training and
                validation currently share the same pipeline.

        Returns:
            torchvision.transforms.Compose object
        """
        return T.Compose([
            T.ToTensor(),
        ])

Writing /kaggle/working/transforms.py


In [None]:
%%writefile /kaggle/working/config.py
# config.py - Configuration settings
from dataclasses import dataclass
from typing import Optional, Dict, Any

@dataclass
class TrainingConfig:
    """Training settings.

    Every attribute is an annotated dataclass field, so individual values can
    be overridden at construction time, e.g. ``TrainingConfig(BATCH_SIZE=8)``.
    Without the annotations ``@dataclass`` would generate no fields and any
    keyword argument would raise ``TypeError``.
    """

    # Data paths
    TRAIN_IMG_DIR: str = '/kaggle/working/HAD-v2-8/train'
    TRAIN_ANNOTATIONS: str = '/kaggle/working/HAD-v2-8/train/_annotations.coco.json'
    VAL_IMG_DIR: str = '/kaggle/working/HAD-v2-8/valid'
    VAL_ANNOTATIONS: str = '/kaggle/working/HAD-v2-8/valid/_annotations.coco.json'

    # Training parameters
    NUM_CLASSES: int = 8  # 7 classes + background
    BATCH_SIZE: int = 4
    NUM_WORKERS: int = 2
    DEVICE: str = 'cuda'
    LOG_FILE: str = 'training.log'
    SAVE_FREQ: int = 5      # Save checkpoint every N epochs
    MODEL_DIR: str = 'models'

    # Model configurations
    BACKBONE: str = 'resnet50'
    LEARNING_RATE: float = 0.0001
    NUM_EPOCHS: int = 10
    WEIGHT_DECAY: float = 0.0005

# # dataset.py - Dataset handling
# from dataset import COCODetectionDataset  # Using the code from your project files

# # transforms.py - Data transformations
# from transforms import TransformFactory  # Using the code from your project files

# # trainer.py - Training implementation
# from trainer import ObjectDetectionTrainer  # Using the code from your project files

# # GPU_optimizer.py - GPU optimization
# from GPU_optimizer import GPUOptimizer  # Using the code from your project files

Writing /kaggle/working/config.py


In [None]:
!pip install pycocotools

Collecting pycocotools
  Downloading pycocotools-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Downloading pycocotools-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (427 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.8/427.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pycocotools
Successfully installed pycocotools-2.0.8


In [None]:
!pip install loguru


Collecting loguru
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.2


In [None]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.32-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.11-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.32-py3-none-any.whl (887 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.0/887.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading ultralytics_thop-2.0.11-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.32 ultralytics-thop-2.0.11


In [None]:
import torch
from torch.utils.data import DataLoader
import torchvision.transforms as T
from config import TrainingConfig
from dataset import COCODetectionDataset
from trainer import ObjectDetectionTrainer

def main():
    """Build datasets and loaders from the config and train Faster R-CNN.

    Returns:
        The per-epoch metrics list returned by the trainer, or ``None`` when
        training raised (the error is printed, not re-raised).
    """
    # Initialize config
    config = TrainingConfig()

    # Set up transforms. Only convert to a 0-1 float tensor: torchvision
    # detection models normalize with the ImageNet mean/std internally, so a
    # Normalize step here would be applied twice.
    transform = T.Compose([
        T.ToTensor(),
    ])

    # Create datasets
    train_dataset = COCODetectionDataset(
        root=config.TRAIN_IMG_DIR,
        annotation_file=config.TRAIN_ANNOTATIONS,
        transforms=transform
    )

    val_dataset = COCODetectionDataset(
        root=config.VAL_IMG_DIR,
        annotation_file=config.VAL_ANNOTATIONS,
        transforms=transform
    )

    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
        collate_fn=ObjectDetectionTrainer.collate_fn
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        num_workers=config.NUM_WORKERS,
        collate_fn=ObjectDetectionTrainer.collate_fn
    )

    # Initialize trainer
    trainer = ObjectDetectionTrainer(config)

    # Train model. Keep the try body to the call that can fail so the success
    # message is only printed when training really finished.
    try:
        metrics = trainer.train(
            model_name='fasterrcnn',
            train_loader=train_loader,
            val_loader=val_loader
        )
    except Exception as e:
        print(f"Error during training: {str(e)}")
        return None

    print("Training completed successfully!")
    return metrics

# if __name__ == "__main__":
#     main()

In [None]:
main()

loading annotations into memory...


[32m2024-11-18 15:39:15.803[0m | [1mINFO    [0m | [36mdataset[0m:[36m__init__[0m:[36m39[0m - [1mLoaded dataset with 8122 valid images and 7 classes[0m
[32m2024-11-18 15:39:15.804[0m | [1mINFO    [0m | [36mdataset[0m:[36m__init__[0m:[36m40[0m - [1mClasses: ['bus', 'car', 'crosswalk', 'person', 'stop sign', 'traffic light', 'truck'][0m
[32m2024-11-18 15:39:15.846[0m | [1mINFO    [0m | [36mdataset[0m:[36m__init__[0m:[36m39[0m - [1mLoaded dataset with 744 valid images and 7 classes[0m
[32m2024-11-18 15:39:15.847[0m | [1mINFO    [0m | [36mdataset[0m:[36m__init__[0m:[36m40[0m - [1mClasses: ['bus', 'car', 'crosswalk', 'person', 'stop sign', 'traffic light', 'truck'][0m
`torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
[32m2024-11-18 15:39:15.912[0m | [1mINFO    [0m | [36mtrainer[0m:[36m_setup_logging[0m:[36m25[0m - [1mUsing device: cuda[0m
[32m2024-11-18 15:39:15.913[0m 

Done (t=0.46s)
creating index...
index created!
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 175MB/s] 
[32m2024-11-18 15:39:17.742[0m | [1mINFO    [0m | [36mtrainer[0m:[36m_get_model[0m:[36m40[0m - [1mSuccessfully initialized fasterrcnn model[0m
[32m2024-11-18 15:39:17.745[0m | [1mINFO    [0m | [36mtrainer[0m:[36mtrain[0m:[36m190[0m - [1mStarting epoch 0[0m
[32m2024-11-18 15:39:20.951[0m | [1mINFO    [0m | [36mtrainer[0m:[36mtrain_one_epoch[0m:[36m99[0m - [1mEpoch 0, Iteration 0, Loss: 2.7994, LR: 0.000000, Memory: {'allocated': '699.1MB', 'cached': '5430.0MB', 'max_allocated': '4809.9MB'}[0m
[32m2024-11-18 15:39:27.491[0m | [1mINFO    [0m | [36mtrainer[0m:[36mtrain_one_epoch[0m:[36m99[0m - [1mEpoch 0, Iteration 10, Loss: 2.6965, LR: 0.000001, Memory: {'allocated': '699.2MB', 'cached': '5866.0MB', 'max_allocated': '5304.8MB'}[0m
[32m2024-11-18 15:39:33.975