# Electrical Symbol Detection - Training on Google Colab
Train a Faster R-CNN model (ResNet50+FPN backbone) with CIoU loss for multi-class detection of electrical symbols.

## 1. Setup Environment

In [1]:
import sys
import torch

# Report the interpreter, framework, and accelerator for this session.
# The availability check is evaluated once and reused for both lines.
cuda_ok = torch.cuda.is_available()
gpu_label = torch.cuda.get_device_name(0) if cuda_ok else "Not available (will use CPU)"

print(
    f"Python version: {sys.version}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {cuda_ok}",
    f"GPU: {gpu_label}",
    sep="\n",
)

Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
PyTorch version: 2.9.0+cu128
CUDA available: True
GPU: NVIDIA A100-SXM4-80GB


## 2. Mount Google Drive (for saving checkpoints)

In [2]:
import sys
import os

# The presence of the `google.colab` package in sys.modules is used as the
# "running on Colab" signal.
IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    print("‚ö† Running locally (not on Google Colab)")
    print("Dataset will be saved locally")
else:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=False)
    except Exception as mount_error:
        # Best effort: a failed mount is reported, not raised, so the
        # notebook can continue with local storage.
        print(f"‚ö† Could not mount Google Drive: {mount_error}")
        print("Proceeding without Drive - checkpoints will save locally in /content/")
    else:
        print("‚úì Google Drive mounted at /content/drive")

Mounted at /content/drive
‚úì Google Drive mounted at /content/drive


## 3. Clone Repository

In [8]:
import os
import subprocess

repo_path = '/content/symbol-detection'

if os.path.exists(repo_path):
    # Existing checkout: switch into it and pull the latest changes.
    # Note that the working directory is only changed on this branch.
    print(f"Repository already exists at {repo_path}")
    os.chdir(repo_path)
    pull_cmd = ['git', 'pull']
    subprocess.run(pull_cmd, check=True)
    print("Repository updated")
else:
    # First run: clone the repository; check=True raises if git fails.
    clone_cmd = [
        'git',
        'clone',
        'https://github.com/BhanukaDev/symbol-detection.git',
        repo_path,
    ]
    subprocess.run(clone_cmd, check=True)
    print(f"Repository cloned to {repo_path}")

Repository already exists at /content/symbol-detection
Repository updated


## 4. Install Dependencies

In [14]:
import os

# Change to python directory if in Colab
if os.path.exists('/content/symbol-detection/python'):
    os.chdir('/content/symbol-detection/python')
    
    # Install local workspace packages first
    print("Installing local workspace packages...")
    !pip install -e ./floor-grid
    !pip install -e ./effects
    print("‚úì Workspace packages installed")

# Install external dependencies
!pip install torch torchvision torchmetrics pycocotools timm

# Install main package
!pip install -e .

Installing local workspace packages...
Obtaining file:///content/symbol-detection/python/floor-grid
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: floor-grid
  Building editable for floor-grid (pyproject.toml) ... [?25l[?25hdone
  Created wheel for floor-grid: filename=floor_grid-0.1.0-py3-none-any.whl size=1196 sha256=78699117b6f3e43fc7361415f221b0fc1e7cda53ed8d7db66595655680a66a57
  Stored in directory: /tmp/pip-ephem-wheel-cache-gx8ub21l/wheels/9e/6b/f6/93c9e88f3c6f9856769f5b99711035582c95144db05c26c467
Successfully built floor-grid
Installing collected packages: floor-grid
  Attempting uninstall: floor-grid
    Found existing installation: floor-grid 0.1.0
    Uninstalling floor-grid-0.1.0:
      Successfully uninstalled floor-grid-0.1

## 5. Verify Installation

In [10]:
import sys
import os

# Ensure we're in the right directory
if os.path.exists('/content/symbol-detection/python'):
    os.chdir('/content/symbol-detection/python')
    sys.path.insert(0, '/content/symbol-detection/python/src')

try:
    from symbol_detection.training import Trainer, CIoULoss
    from symbol_detection.dataset.generator import COCODatasetGenerator
    
    print("‚úì symbol-detection package imported successfully")
    print("‚úì Trainer available")
    print("‚úì CIoU Loss available")
    print("‚úì COCODatasetGenerator available")
except ImportError as e:
    print(f"‚úó Import failed: {e}")
    print("\nReinstalling package...")
    os.chdir('/content/symbol-detection/python')
    !pip install -e .
    print("Please re-run this cell after installation completes.")

‚úó Import failed: cannot import name '_center' from 'numpy._core.umath' (/usr/local/lib/python3.12/dist-packages/numpy/_core/umath.py)

Reinstalling package...
Obtaining file:///content/symbol-detection/python
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: symbol-detection
  Building editable for symbol-detection (pyproject.toml) ... [?25l[?25hdone
  Created wheel for symbol-detection: filename=symbol_detection-0.1.0-0.editable-py3-none-any.whl size=1791 sha256=0b68963c90ddd0ab19472ca1d87db1775ed74551726790e64a56eebff835a67f
  Stored in directory: /tmp/pip-ephem-wheel-cache-jn022hg8/wheels/94/36/75/5aa7df0c2e953f991eb4aa945f3eb59d6faa6eeb9272dcd759
Successfully built symbol-detection
Installing collected packages: symbol-detection
  Attem

Please re-run this cell after installation completes.


## 6. Configure Dataset and Checkpoint Locations

In [11]:
import os
import sys
from pathlib import Path

# Determine paths based on environment
IN_COLAB = 'google.colab' in sys.modules
DRIVE_MOUNTED = IN_COLAB and os.path.exists('/content/drive/MyDrive')

if not IN_COLAB:
    # Local development: resolve storage relative to the current directory.
    _local_root = Path.cwd().parent.parent / 'python'
    dataset_dir = str(_local_root / 'dataset')
    checkpoints_dir = str(_local_root / 'checkpoints')
    print("üìÅ Using local storage")
elif DRIVE_MOUNTED:
    # Save to Google Drive
    dataset_dir = '/content/drive/MyDrive/symbol-detection/dataset'
    checkpoints_dir = '/content/drive/MyDrive/symbol-detection/checkpoints'
    print("‚úì Using Google Drive for storage")
else:
    # Fallback to temporary Colab storage
    dataset_dir = '/content/symbol-detection/dataset'
    checkpoints_dir = '/content/symbol-detection/checkpoints'
    print("‚ö† Google Drive not mounted - using temporary Colab storage")

# Create both directories up front; existing ones are left as they are.
for _storage_dir in (dataset_dir, checkpoints_dir):
    os.makedirs(_storage_dir, exist_ok=True)

_annotations_path = os.path.join(dataset_dir, 'annotations.json')

print(f"Dataset directory: {dataset_dir}")
print(f"Checkpoints directory: {checkpoints_dir}")
print(f"Dataset exists: {os.path.exists(_annotations_path)}")

‚úì Using Google Drive for storage
Dataset directory: /content/drive/MyDrive/symbol-detection/dataset
Checkpoints directory: /content/drive/MyDrive/symbol-detection/checkpoints
Dataset exists: True


## 7. Generate Dataset (optional - skip if a pre-generated dataset already exists)

In [15]:
import os
import sys

# Ensure we are in the collection root
os.chdir('/content/symbol-detection/python')

# Manually add library paths to ensure modules are found even if pip install -e
# isn't fully active in this session. Each path is moved to the front of
# sys.path (removed first if already present), so re-running the cell keeps
# the same lookup order without accumulating duplicates.
for _src_path in (
    '/content/symbol-detection/python/floor-grid/src',
    '/content/symbol-detection/python/effects/src',
    '/content/symbol-detection/python/src',
):
    if _src_path in sys.path:
        sys.path.remove(_src_path)
    sys.path.insert(0, _src_path)

from symbol_detection.dataset.generator import COCODatasetGenerator

# Single source of truth for the dataset size, so the log message cannot
# drift from the value actually passed to the generator.
NUM_IMAGES = 300

print(f"Generating dataset ({NUM_IMAGES} images)...")

generator = COCODatasetGenerator(
    output_dir=dataset_dir,
    symbols_dir='data/electrical-symbols',
)

# Use the built-in generator with proper COCO annotation conversion
generator.generate_dataset(
    num_images=NUM_IMAGES,
    rows=(15, 30),              # min, max rows
    cols=(15, 30),              # min, max columns
    cell_size=(20, 25),         # min, max cell size
    apply_symbol_effects=False, # Skip slow effects
    apply_image_effects=True,   # Keep image effects
)

# Save annotations to disk
generator.save_annotations()

# Counts every entry in the images directory, including files left over from
# earlier runs.
num_images = len(os.listdir(os.path.join(dataset_dir, 'images')))
print(f"‚úì Dataset generated: {num_images} images")
print(f"‚úì Annotations saved: {dataset_dir}/annotations.json")

Generating dataset (200 images)...
Generating 300 dataset images with varied dimensions...
  - Rows range: 15 to 30
  - Cols range: 15 to 30
  - Cell size range: 20 to 25 pixels
  - Symbol effects: disabled
  - Image effects: enabled
Loaded 7 symbol classes:
  - Junction Box: 1 variant(s)
  - Two-way switch: 1 variant(s)
  - Single-pole, one-way switch: 1 variant(s)
  - Light: 1 variant(s)
  - Three-pole, one-way switch: 1 variant(s)
  - Two-pole, one-way switch: 1 variant(s)
  - Duplex Receptacle: 1 variant(s)
[1/300] Generated floor_plan_0000.png (16x26) - 2 rooms, 6 symbols
Loaded 7 symbol classes:
  - Junction Box: 1 variant(s)
  - Two-way switch: 1 variant(s)
  - Single-pole, one-way switch: 1 variant(s)
  - Light: 1 variant(s)
  - Three-pole, one-way switch: 1 variant(s)
  - Two-pole, one-way switch: 1 variant(s)
  - Duplex Receptacle: 1 variant(s)
[2/300] Generated floor_plan_0001.png (21x16) - 2 rooms, 6 symbols
Loaded 7 symbol classes:
  - Junction Box: 1 variant(s)
  - Two-wa

## 8. Training Configuration

In [None]:
import os

# Training hyperparameters
# Batch size is kept at 8 to avoid OOM errors (A100 has 40GB but a shared
# environment can be tricky).
training_config = dict(
    num_epochs=50,        # Full training
    batch_size=8,         # Reduced from 12 to prevent CUDA OOM
    learning_rate=0.005,
    num_classes=7,        # Electrical symbols
    use_ciou_loss=True,   # Complete IoU Loss per paper
    eval_every_n=10,      # Evaluate AP metrics every 10 epochs
    enable_ap_eval=True,  # Enable AP50/AP75/mAP evaluation
)

print("Training Configuration:")
print("\n".join(f"  {name}: {setting}" for name, setting in training_config.items()))

# Set allocator config to reduce fragmentation
# NOTE(review): torch is already imported by the time this cell runs - confirm
# the CUDA allocator still picks this variable up at this point.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print(f"\n‚úì Batch size set to {training_config['batch_size']} to ensure stability")
print("‚úì Configured PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True")

Training Configuration:
  num_epochs: 50
  batch_size: 12
  learning_rate: 0.005
  num_classes: 7
  use_ciou_loss: True
  eval_every_n: 10
  enable_ap_eval: True

‚úì A100 GPU selected - using batch_size=12 for optimal performance
‚úì AP evaluation enabled every 10 epochs


## 9. Run Training

In [None]:
import gc
import importlib

import torch
import symbol_detection.training.trainer

# Reload the trainer module so code pulled from the repository takes effect
# without restarting the kernel, then take Trainer from the *reloaded* module
# object. importlib.reload does not rebind names that other modules imported
# earlier, so a Trainer re-exported elsewhere could still be the pre-reload
# class.
_trainer_module = importlib.reload(symbol_detection.training.trainer)
Trainer = _trainer_module.Trainer  # Now with AP evaluation

# Free up memory before starting. Garbage is collected first so that memory
# held by unreachable objects is returned to the caching allocator before
# empty_cache() releases the cached blocks.
if torch.cuda.is_available():
    gc.collect()
    torch.cuda.empty_cache()
    print(f"Pre-training memory cleared. Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB")

# Initialize trainer with AP evaluation enabled
trainer = Trainer(
    dataset_dir=dataset_dir,
    output_dir=checkpoints_dir,
    num_classes=training_config['num_classes'],
    batch_size=training_config['batch_size'],
    learning_rate=training_config['learning_rate'],
    num_epochs=training_config['num_epochs'],
    device='cuda' if torch.cuda.is_available() else 'cpu',
    use_ciou_loss=training_config['use_ciou_loss'],
    eval_every_n=training_config['eval_every_n'],
    enable_ap_eval=training_config['enable_ap_eval'],
)

print(f"Trainer initialized on device: {trainer.device}")
print("Model: FasterRCNN with ResNet50+FPN backbone")
print(f"CIoU Loss: {training_config['use_ciou_loss']}")
print(f"AP Evaluation: Every {training_config['eval_every_n']} epochs")

Using device: cuda
Trainer initialized on device: cuda
Model: FasterRCNN with ResNet50+FPN backbone
CIoU Loss: True
AP Evaluation: Every 10 epochs


In [None]:
import json

# Verify annotations have bbox field before training
ann_file = f'{dataset_dir}/annotations.json'
with open(ann_file, 'r') as ann_fp:
    data = json.load(ann_fp)

print("Checking annotations format...")
# (label shown to the user, top-level key in the annotations file)
for _label, _section in (
    ("Total images", 'images'),
    ("Total annotations", 'annotations'),
    ("Categories", 'categories'),
):
    print(f"  - {_label}: {len(data.get(_section, []))}")

if not data.get('annotations'):
    print("‚ö† No annotations found in JSON")
else:
    # Only the first annotation is inspected as a format sample.
    first_ann = data['annotations'][0]
    print("\nFirst annotation sample:")
    print(f"  - Keys: {list(first_ann)}")
    if 'bbox' not in first_ann:
        print("  - bbox: MISSING ‚úó")
        print("\n‚ö† ERROR: annotations.json does not have 'bbox' field!")
        print("Solution: Please re-run cell 7 (Dataset Generation) to regenerate with proper COCO format")
    else:
        print(f"  - bbox: {first_ann['bbox']} ‚úì")

import torch

# Patch the validate method to fix loss computation
def fixed_validate(self, val_loader):
    """Compute the mean validation loss over ``val_loader`` and record it.

    Replacement for ``Trainer.validate``. The model is kept in train mode
    with gradients disabled; the forward call is expected to return a dict
    of loss tensors in that mode. A non-dict result contributes zero loss.

    Args:
        self: Object providing ``model``, ``device`` and a ``val_losses`` list.
        val_loader: Iterable of ``(images, targets)`` batches, where each
            target is a mapping with ``'boxes'`` and ``'labels'`` tensors.

    Returns:
        The average per-batch loss as a float, or NaN when ``val_loader``
        yields no batches. The value is also appended to ``self.val_losses``.
    """
    self.model.train()  # Keep in train mode, disable gradients
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for images, targets in val_loader:
            images = [img.to(self.device) for img in images]
            # Only boxes and labels are forwarded to the model; any other
            # target fields are dropped.
            targets = [
                {
                    'boxes': t['boxes'].to(self.device),
                    'labels': t['labels'].to(self.device),
                }
                for t in targets
            ]

            loss_dict = self.model(images, targets)
            if isinstance(loss_dict, dict):
                losses = sum(loss_dict.values(), torch.tensor(0.0, device=self.device))
            else:
                losses = torch.tensor(0.0, device=self.device)

            total_loss += losses.item()
            num_batches += 1

    # An empty loader has no meaningful average: report NaN instead of
    # raising ZeroDivisionError, and still record one entry for this call.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    self.val_losses.append(avg_loss)
    return avg_loss

# Apply patch
from symbol_detection.training.trainer import Trainer

# Patch the class, and also bind the function directly onto the
# already-constructed `trainer` instance.
setattr(Trainer, 'validate', fixed_validate)
setattr(trainer, 'validate', fixed_validate.__get__(trainer, Trainer))

print("‚úì Trainer validate() method patched")

Checking annotations format...
  - Total images: 300
  - Total annotations: 2300
  - Categories: 7

First annotation sample:
  - Keys: ['id', 'image_id', 'category_id', 'bbox', 'area', 'iscrowd']
  - bbox: [93.0, 124.0, 31.0, 47.0] ‚úì
‚úì Trainer validate() method patched


In [None]:
import traceback

# Start training. Failures are reported with a full traceback instead of
# being raised, so the remaining cells can still be run.
try:
    trainer.train()
except Exception as train_error:
    print(f"‚úó Training failed: {train_error}")
    traceback.print_exc()
else:
    print("\n‚úì Training completed successfully")

Training for 50 epochs...
Training samples: 240, Validation samples: 60
AP evaluation every 10 epochs

‚úó Training failed: CUDA out of memory. Tried to allocate 138.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 79.81 MiB is free. Process 5927 has 28.02 GiB memory in use. Process 44013 has 1.78 GiB memory in use. Including non-PyTorch memory, this process has 9.60 GiB memory in use. Of the allocated memory 8.57 GiB is allocated by PyTorch, and 557.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Traceback (most recent call last):
  File "/tmp/ipython-input-1712165534.py", line 3, in <cell line: 0>
    trainer.train()
  File "/content/symbol-detection/python/src/symbol_detection/training/trainer.py", line 238, in train
    train_loss = self.train_epoch(train_loader)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/symbol-detection/python/src/symbol_detection/training/trainer.py", line 153, in train_epoch
    loss_dict = self.model(images, targets)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torchvision/models/detection/generalized_rcnn.py",

## 10. Visualize Training Metrics

In [None]:
import matplotlib.pyplot as plt
import json
from pathlib import Path

metrics_file = Path(checkpoints_dir) / 'metrics.json'

if not metrics_file.exists():
    print("Metrics file not found. Training may not have completed.")
else:
    with open(metrics_file, 'r') as metrics_fp:
        metrics = json.load(metrics_fp)

    # (key in metrics.json, legend label, marker) for each plotted curve
    loss_curves = (
        ('train_losses', 'Train Loss', 'o'),
        ('val_losses', 'Validation Loss', 's'),
    )

    plt.figure(figsize=(10, 6))
    for _series_key, _legend_label, _marker in loss_curves:
        plt.plot(metrics[_series_key], label=_legend_label, marker=_marker)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Progress')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    # The figure is written next to the checkpoints before being shown.
    plt.savefig(f'{checkpoints_dir}/training_curve.png', dpi=150)
    plt.show()

    final_train_loss = metrics['train_losses'][-1]
    final_val_loss = metrics['val_losses'][-1]
    print(f"Final train loss: {final_train_loss:.4f}")
    print(f"Final val loss: {final_val_loss:.4f}")

Metrics file not found. Training may not have completed.


## 11. List Saved Checkpoints

In [None]:
from pathlib import Path

checkpoints = list(Path(checkpoints_dir).glob('*.pth'))

if not checkpoints:
    print("No checkpoints found")
else:
    _bytes_per_mb = 1024 * 1024

    print(f"Saved checkpoints ({len(checkpoints)}):")
    for _ckpt_path in sorted(checkpoints):
        _size_mb = _ckpt_path.stat().st_size / _bytes_per_mb
        print(f"  {_ckpt_path.name} ({_size_mb:.1f} MB)")

    # "Latest" is the file with the most recent modification time.
    _newest = max(checkpoints, key=lambda path: path.stat().st_mtime)
    print(f"\nLatest checkpoint: {_newest.name}")

No checkpoints found


## 12. Download Best Model (Optional)

In [None]:
# Nothing is downloaded here: the checkpoints already live in checkpoints_dir
# (on Google Drive: /content/drive/MyDrive/symbol-detection/checkpoints/).
# This cell only reminds the user where to fetch them from.

print(
    f"Checkpoints saved to: {checkpoints_dir}",
    "You can download them from Google Drive or use the Colab Files panel on the left",
    sep="\n",
)

Checkpoints saved to: /content/drive/MyDrive/symbol-detection/checkpoints
You can download them from Google Drive or use the Colab Files panel on the left
