## 1. Setup and Imports

In [1]:
import os
import json
import random
from pathlib import Path
from typing import Dict, List, Tuple, Any
from dataclasses import dataclass

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
import torchvision.transforms as transforms

# Set seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")

Using device: cuda
PyTorch version: 2.8.0+cu126


## 2. Configuration

In [2]:
@dataclass
class Config:
    # Data paths
    synthetic_images: str = "/kaggle/input/med-synthetic-dataset/kaggle/working/dataset/images"
    synthetic_labels: str = "/kaggle/input/med-synthetic-dataset/kaggle/working/dataset/labels"
    
    real_images: str = "/kaggle/input/med-real-dataset/dataset/images"
    real_labels: str = "/kaggle/input/med-real-dataset/dataset/labels"
    
    # Crop parameters
    crop_width: int = 128      # Width of crop around data point
    crop_height: int = 384     # Height of crop (more vertical space for error bars)
    max_offset: float = 280.0  # Maximum expected error bar length (for normalization)
    
    # Model parameters
    backbone: str = "resnet18"  # resnet18, resnet34
    pretrained: bool = True
    
    # Training parameters
    batch_size: int = 32
    num_epochs_pretrain: int = 5
    num_epochs_finetune: int = 20
    learning_rate: float = 1e-3
    weight_decay: float = 1e-4
    
    # Data split
    train_split: float = 0.95   # 95% of synthetic for training
    real_train_split: float = 0.8  # 80% of real for fine-tuning
    
    # Output
    checkpoint_dir: str = "checkpoints"
    results_dir: str = "results"

cfg = Config()
os.makedirs(cfg.checkpoint_dir, exist_ok=True)
os.makedirs(cfg.results_dir, exist_ok=True)

print(f"Configuration:")
print(f"  Crop size: {cfg.crop_width}×{cfg.crop_height}")
print(f"  Backbone: {cfg.backbone}")
print(f"  Batch size: {cfg.batch_size}")
print(f"  Device: {device}")

Configuration:
  Crop size: 128×384
  Backbone: resnet18
  Batch size: 32
  Device: cuda


## 3. Dataset Class

In [3]:
class ErrorBarDataset(Dataset):
    """
    Dataset for error bar detection.
    
    Returns crops around each data point with ground truth offsets.
    """
    
    def __init__(
        self,
        image_ids: List[str],
        images_dir: str,
        labels_dir: str,
        crop_width: int = 128,
        crop_height: int = 384,
        max_offset: float = 280.0,
        transform=None,
        augment: bool = False
    ):
        self.image_ids = image_ids
        self.images_dir = images_dir
        self.labels_dir = labels_dir
        self.crop_width = crop_width
        self.crop_height = crop_height
        self.max_offset = max_offset
        self.transform = transform
        self.augment = augment
        
        # Build index of all data points
        self.samples = []
        self._build_index()
        
        print(f"Dataset: {len(self.samples)} data points from {len(image_ids)} images")
    
    def _build_index(self):
        """Build index of all data points across all images."""
        for img_id in self.image_ids:
            label_path = os.path.join(self.labels_dir, f"{img_id}.json")
            if not os.path.exists(label_path):
                continue
            
            with open(label_path, 'r') as f:
                label = json.load(f)
            
            # Extract all actual data points (not anchor points)
            for line in label:
                for point in line["points"]:
                    if point.get("label", "") == "":  # actual data point
                        self.samples.append({
                            "image_id": img_id,
                            "x": float(point["x"]),
                            "y": float(point["y"]),
                            "dy_up": float(point.get("topBarPixelDistance", 0)),
                            "dy_down": float(point.get("bottomBarPixelDistance", 0))
                        })
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        sample = self.samples[idx]
        
        # Load image
        img_path = os.path.join(self.images_dir, f"{sample['image_id']}.png")
        img = Image.open(img_path).convert('RGB')
        img_w, img_h = img.size
        
        # Extract crop around data point
        x, y = int(sample['x']), int(sample['y'])
        
        # Crop boundaries (centered on data point)
        x1 = max(0, x - self.crop_width // 2)
        x2 = min(img_w, x + self.crop_width // 2)
        y1 = max(0, y - self.crop_height // 2)
        y2 = min(img_h, y + self.crop_height // 2)
        
        crop = img.crop((x1, y1, x2, y2))
        
        # Pad if crop is too small (near edges)
        if crop.size != (self.crop_width, self.crop_height):
            padded = Image.new('RGB', (self.crop_width, self.crop_height), (255, 255, 255))
            paste_x = (self.crop_width - crop.size[0]) // 2
            paste_y = (self.crop_height - crop.size[1]) // 2
            padded.paste(crop, (paste_x, paste_y))
            crop = padded
        
        # Apply transforms
        if self.transform:
            crop = self.transform(crop)
        
        # Ground truth offsets (normalized)
        dy_up = sample['dy_up'] / self.max_offset
        dy_down = sample['dy_down'] / self.max_offset
        target = torch.tensor([dy_up, dy_down], dtype=torch.float32)
        
        return crop, target

print("Dataset class defined")

Dataset class defined


## 4. Model Architecture

In [4]:
class ErrorBarRegressor(nn.Module):
    """
    CNN-based regressor for error bar endpoint detection.
    
    Architecture:
    - ResNet backbone (pretrained on ImageNet)
    - Global average pooling
    - FC layers → 2 outputs (dy_up, dy_down)
    """
    
    def __init__(self, backbone='resnet18', pretrained=True):
        super().__init__()
        
        # Load pretrained backbone
        if backbone == 'resnet18':
            resnet = models.resnet18(pretrained=pretrained)
            feat_dim = 512
        elif backbone == 'resnet34':
            resnet = models.resnet34(pretrained=pretrained)
            feat_dim = 512
        else:
            raise ValueError(f"Unknown backbone: {backbone}")
        
        # Remove final FC layer
        self.backbone = nn.Sequential(*list(resnet.children())[:-1])
        
        # Regression head
        self.regressor = nn.Sequential(
            nn.Linear(feat_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 2)  # (dy_up, dy_down)
        )
    
    def forward(self, x):
        # Extract features
        features = self.backbone(x)  # (batch, feat_dim, 1, 1)
        features = features.view(features.size(0), -1)  # (batch, feat_dim)
        
        # Regress offsets
        offsets = self.regressor(features)  # (batch, 2)
        return offsets

# Test model
model = ErrorBarRegressor(backbone=cfg.backbone, pretrained=cfg.pretrained)
model = model.to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model: {cfg.backbone}")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Test forward pass
dummy_input = torch.randn(2, 3, cfg.crop_height, cfg.crop_width).to(device)
dummy_output = model(dummy_input)
print(f"Output shape: {dummy_output.shape}  # (batch_size, 2)")



Model: resnet18
Total parameters: 11,324,418
Trainable parameters: 11,324,418
Output shape: torch.Size([2, 2])  # (batch_size, 2)


## 5. Data Loading

In [5]:
# Transforms
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                       std=[0.229, 0.224, 0.225])  # ImageNet stats
])

val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                       std=[0.229, 0.224, 0.225])
])

# Get all image IDs
synthetic_ids = sorted([
    os.path.splitext(f)[0]
    for f in os.listdir(cfg.synthetic_images)
    if f.endswith('.png')
])

real_ids = sorted([
    os.path.splitext(f)[0]
    for f in os.listdir(cfg.real_images)
    if f.endswith('.png')
])

print(f"Total synthetic images: {len(synthetic_ids)}")
print(f"Total real images: {len(real_ids)}")

# Split synthetic into train/val
random.shuffle(synthetic_ids)
split_idx = int(len(synthetic_ids) * cfg.train_split)
synth_train_ids = synthetic_ids[:split_idx]
synth_val_ids = synthetic_ids[split_idx:]

# Split real into train/test
random.shuffle(real_ids)
split_idx = int(len(real_ids) * cfg.real_train_split)
real_train_ids = real_ids[:split_idx]
real_test_ids = real_ids[split_idx:]

print(f"\nData splits:")
print(f"  Synthetic train: {len(synth_train_ids)} images")
print(f"  Synthetic val:   {len(synth_val_ids)} images")
print(f"  Real train:      {len(real_train_ids)} images (for fine-tuning)")
print(f"  Real test:       {len(real_test_ids)} images (held-out)")

# Create datasets
train_dataset = ErrorBarDataset(
    synth_train_ids,
    cfg.synthetic_images,
    cfg.synthetic_labels,
    cfg.crop_width,
    cfg.crop_height,
    cfg.max_offset,
    train_transform,
    augment=True
)

val_dataset = ErrorBarDataset(
    synth_val_ids,
    cfg.synthetic_images,
    cfg.synthetic_labels,
    cfg.crop_width,
    cfg.crop_height,
    cfg.max_offset,
    val_transform
)

# Create dataloaders
train_loader = DataLoader(
    train_dataset,
    batch_size=cfg.batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=cfg.batch_size,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

print(f"\nDataloaders ready:")
print(f"  Train: {len(train_loader)} batches")
print(f"  Val:   {len(val_loader)} batches")

Total synthetic images: 2850
Total real images: 150

Data splits:
  Synthetic train: 2707 images
  Synthetic val:   143 images
  Real train:      120 images (for fine-tuning)
  Real test:       30 images (held-out)
Dataset: 68961 data points from 2707 images
Dataset: 3562 data points from 143 images

Dataloaders ready:
  Train: 2156 batches
  Val:   112 batches


## 6. Training Functions

In [6]:
def train_epoch(model, loader, criterion, optimizer, device):
    """Train for one epoch."""
    model.train()
    total_loss = 0
    
    pbar = tqdm(loader, desc='Training')
    for images, targets in pbar:
        images = images.to(device)
        targets = targets.to(device)
        
        # Forward
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, targets)
        
        # Backward
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        pbar.set_postfix({'loss': loss.item()})
    
    return total_loss / len(loader)


def validate(model, loader, criterion, device, max_offset):
    """Validate model."""
    model.eval()
    total_loss = 0
    errors_up = []
    errors_down = []
    
    with torch.no_grad():
        for images, targets in tqdm(loader, desc='Validation'):
            images = images.to(device)
            targets = targets.to(device)
            
            outputs = model(images)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            
            # Denormalize and compute errors
            pred_up = outputs[:, 0].cpu().numpy() * max_offset
            pred_down = outputs[:, 1].cpu().numpy() * max_offset
            gt_up = targets[:, 0].cpu().numpy() * max_offset
            gt_down = targets[:, 1].cpu().numpy() * max_offset
            
            errors_up.extend(np.abs(pred_up - gt_up))
            errors_down.extend(np.abs(pred_down - gt_down))
    
    # Compute metrics
    errors = np.array(errors_up + errors_down)
    metrics = {
        'loss': total_loss / len(loader),
        'mean_error': float(errors.mean()),
        'median_error': float(np.median(errors)),
        'pct_within_2px': float((errors <= 2).mean()),
        'pct_within_5px': float((errors <= 5).mean()),
        'pct_within_10px': float((errors <= 10).mean())
    }
    
    return metrics

print("Training functions defined")

Training functions defined


## 7. Train on Synthetic Data

In [7]:
# Initialize model
model = ErrorBarRegressor(backbone=cfg.backbone, pretrained=cfg.pretrained)
model = model.to(device)

# Loss and optimizer
criterion = nn.SmoothL1Loss()  # Robust to outliers
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=cfg.learning_rate,
    weight_decay=cfg.weight_decay
)

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=cfg.num_epochs_pretrain
)

print("Starting training on synthetic data...\n")

best_val_loss = float('inf')
train_losses = []
val_losses = []

for epoch in range(cfg.num_epochs_pretrain):
    print(f"\nEpoch {epoch+1}/{cfg.num_epochs_pretrain}")
    print("-" * 50)
    
    # Train
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    train_losses.append(train_loss)
    
    # Validate
    val_metrics = validate(model, val_loader, criterion, device, cfg.max_offset)
    val_losses.append(val_metrics['loss'])
    
    # Update learning rate
    scheduler.step()
    
    # Print metrics
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss:   {val_metrics['loss']:.4f}")
    print(f"Mean Error: {val_metrics['mean_error']:.2f}px")
    print(f"Median Error: {val_metrics['median_error']:.2f}px")
    print(f"Within 2px:  {val_metrics['pct_within_2px']*100:.1f}%")
    print(f"Within 5px:  {val_metrics['pct_within_5px']*100:.1f}%")
    print(f"Within 10px: {val_metrics['pct_within_10px']*100:.1f}%")
    
    # Save best model
    if val_metrics['loss'] < best_val_loss:
        best_val_loss = val_metrics['loss']
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_metrics': val_metrics
        }, os.path.join(cfg.checkpoint_dir, 'best_synthetic.pth'))
        print("✓ Saved best model")

print("\n" + "="*50)
print("Training on synthetic data complete!")
print(f"Best validation loss: {best_val_loss:.4f}")

Starting training on synthetic data...


Epoch 1/5
--------------------------------------------------


Training: 100%|██████████| 2156/2156 [07:30<00:00,  4.78it/s, loss=0.00186] 
Validation: 100%|██████████| 112/112 [00:22<00:00,  5.08it/s]


Train Loss: 0.0024
Val Loss:   0.0024
Mean Error: 12.83px
Median Error: 7.37px
Within 2px:  16.8%
Within 5px:  39.1%
Within 10px: 57.9%
✓ Saved best model

Epoch 2/5
--------------------------------------------------


Training: 100%|██████████| 2156/2156 [07:29<00:00,  4.80it/s, loss=0.000142]
Validation: 100%|██████████| 112/112 [00:21<00:00,  5.10it/s]


Train Loss: 0.0020
Val Loss:   0.0015
Mean Error: 8.65px
Median Error: 4.42px
Within 2px:  23.2%
Within 5px:  59.2%
Within 10px: 75.4%
✓ Saved best model

Epoch 3/5
--------------------------------------------------


Training: 100%|██████████| 2156/2156 [07:29<00:00,  4.79it/s, loss=4.47e-6] 
Validation: 100%|██████████| 112/112 [00:22<00:00,  5.08it/s]


Train Loss: 0.0018
Val Loss:   0.0014
Mean Error: 8.48px
Median Error: 4.76px
Within 2px:  25.1%
Within 5px:  57.8%
Within 10px: 77.3%
✓ Saved best model

Epoch 4/5
--------------------------------------------------


Training: 100%|██████████| 2156/2156 [07:29<00:00,  4.79it/s, loss=0.000225]
Validation: 100%|██████████| 112/112 [00:21<00:00,  5.12it/s]


Train Loss: 0.0017
Val Loss:   0.0013
Mean Error: 8.21px
Median Error: 4.95px
Within 2px:  24.5%
Within 5px:  50.1%
Within 10px: 77.6%
✓ Saved best model

Epoch 5/5
--------------------------------------------------


Training: 100%|██████████| 2156/2156 [07:32<00:00,  4.77it/s, loss=0.000192]
Validation: 100%|██████████| 112/112 [00:22<00:00,  5.04it/s]

Train Loss: 0.0015
Val Loss:   0.0013
Mean Error: 8.08px
Median Error: 4.74px
Within 2px:  24.9%
Within 5px:  58.9%
Within 10px: 78.7%

Training on synthetic data complete!
Best validation loss: 0.0013





## 8. Fine-tune on Real Data

In [8]:
# Load best synthetic model  
checkpoint = torch.load(os.path.join(cfg.checkpoint_dir, 'best_synthetic.pth'))
model.load_state_dict(checkpoint['model_state_dict'])
print("Loaded best synthetic model")

# Create real datasets
real_train_dataset = ErrorBarDataset(
    real_train_ids,
    cfg.real_images,
    cfg.real_labels,
    cfg.crop_width,
    cfg.crop_height,
    cfg.max_offset,
    train_transform,
    augment=True
)

real_test_dataset = ErrorBarDataset(
    real_test_ids,
    cfg.real_images,
    cfg.real_labels,
    cfg.crop_width,
    cfg.crop_height,
    cfg.max_offset,
    val_transform
)

real_train_loader = DataLoader(
    real_train_dataset,
    batch_size=cfg.batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True
)

real_test_loader = DataLoader(
    real_test_dataset,
    batch_size=cfg.batch_size,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

# New optimizer with lower learning rate for fine-tuning
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=cfg.learning_rate * 0.1,  # 10x lower LR
    weight_decay=cfg.weight_decay
)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=cfg.num_epochs_finetune
)

print("\nStarting fine-tuning on real data...\n")

best_test_loss = float('inf')

for epoch in range(cfg.num_epochs_finetune):
    print(f"\nEpoch {epoch+1}/{cfg.num_epochs_finetune}")
    print("-" * 50)
    
    # Train on real data
    train_loss = train_epoch(model, real_train_loader, criterion, optimizer, device)
    
    # Test on held-out real data
    test_metrics = validate(model, real_test_loader, criterion, device, cfg.max_offset)
    
    scheduler.step()
    
    # Print metrics
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Test Loss:  {test_metrics['loss']:.4f}")
    print(f"Mean Error: {test_metrics['mean_error']:.2f}px")
    print(f"Median Error: {test_metrics['median_error']:.2f}px")
    print(f"Within 2px:  {test_metrics['pct_within_2px']*100:.1f}%")
    print(f"Within 5px:  {test_metrics['pct_within_5px']*100:.1f}%")
    print(f"Within 10px: {test_metrics['pct_within_10px']*100:.1f}%")
    
    # Save best model
    if test_metrics['loss'] < best_test_loss:
        best_test_loss = test_metrics['loss']
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'test_metrics': test_metrics
        }, os.path.join(cfg.checkpoint_dir, 'best_finetuned.pth'))
        print("✓ Saved best fine-tuned model")

print("\n" + "="*50)
print("Fine-tuning complete!")
print(f"Best test loss: {best_test_loss:.4f}")

Loaded best synthetic model
Dataset: 3930 data points from 120 images
Dataset: 909 data points from 30 images

Starting fine-tuning on real data...


Epoch 1/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.08it/s, loss=0.00219]
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.15it/s]


Train Loss: 0.0058
Test Loss:  0.0068
Mean Error: 17.93px
Median Error: 8.82px
Within 2px:  9.7%
Within 5px:  23.5%
Within 10px: 59.1%
✓ Saved best fine-tuned model

Epoch 2/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:29<00:00,  4.12it/s, loss=0.00662]
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.25it/s]


Train Loss: 0.0050
Test Loss:  0.0067
Mean Error: 17.93px
Median Error: 9.55px
Within 2px:  8.3%
Within 5px:  23.4%
Within 10px: 59.5%
✓ Saved best fine-tuned model

Epoch 3/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:29<00:00,  4.12it/s, loss=0.0141]  
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.33it/s]


Train Loss: 0.0047
Test Loss:  0.0068
Mean Error: 18.33px
Median Error: 9.16px
Within 2px:  8.3%
Within 5px:  24.7%
Within 10px: 58.4%

Epoch 4/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.10it/s, loss=0.00214] 
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.21it/s]


Train Loss: 0.0044
Test Loss:  0.0068
Mean Error: 18.01px
Median Error: 9.40px
Within 2px:  8.5%
Within 5px:  25.3%
Within 10px: 59.2%

Epoch 5/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.10it/s, loss=0.00245] 
Validation: 100%|██████████| 29/29 [00:07<00:00,  4.06it/s]


Train Loss: 0.0042
Test Loss:  0.0067
Mean Error: 17.39px
Median Error: 9.41px
Within 2px:  8.9%
Within 5px:  26.4%
Within 10px: 62.7%

Epoch 6/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.10it/s, loss=0.00137]
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.34it/s]


Train Loss: 0.0041
Test Loss:  0.0066
Mean Error: 17.10px
Median Error: 9.00px
Within 2px:  10.0%
Within 5px:  27.2%
Within 10px: 62.2%
✓ Saved best fine-tuned model

Epoch 7/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:29<00:00,  4.13it/s, loss=0.000738]
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.26it/s]


Train Loss: 0.0040
Test Loss:  0.0064
Mean Error: 16.58px
Median Error: 8.76px
Within 2px:  11.8%
Within 5px:  28.8%
Within 10px: 64.5%
✓ Saved best fine-tuned model

Epoch 8/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.10it/s, loss=0.00109] 
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.23it/s]


Train Loss: 0.0039
Test Loss:  0.0069
Mean Error: 16.99px
Median Error: 8.43px
Within 2px:  11.8%
Within 5px:  30.4%
Within 10px: 67.5%

Epoch 9/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:29<00:00,  4.11it/s, loss=0.00224] 
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.32it/s]


Train Loss: 0.0037
Test Loss:  0.0063
Mean Error: 16.44px
Median Error: 8.61px
Within 2px:  11.1%
Within 5px:  29.5%
Within 10px: 64.7%
✓ Saved best fine-tuned model

Epoch 10/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.09it/s, loss=0.00551] 
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.17it/s]


Train Loss: 0.0037
Test Loss:  0.0064
Mean Error: 16.69px
Median Error: 9.04px
Within 2px:  10.1%
Within 5px:  28.6%
Within 10px: 63.2%

Epoch 11/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:29<00:00,  4.12it/s, loss=0.00302] 
Validation: 100%|██████████| 29/29 [00:07<00:00,  4.11it/s]


Train Loss: 0.0035
Test Loss:  0.0065
Mean Error: 16.38px
Median Error: 8.38px
Within 2px:  12.2%
Within 5px:  31.4%
Within 10px: 66.3%

Epoch 12/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.08it/s, loss=0.00536] 
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.15it/s]


Train Loss: 0.0034
Test Loss:  0.0064
Mean Error: 16.48px
Median Error: 8.67px
Within 2px:  11.2%
Within 5px:  29.6%
Within 10px: 64.8%

Epoch 13/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.01it/s, loss=0.0012]  
Validation: 100%|██████████| 29/29 [00:07<00:00,  4.12it/s]


Train Loss: 0.0034
Test Loss:  0.0063
Mean Error: 16.56px
Median Error: 8.91px
Within 2px:  10.8%
Within 5px:  28.5%
Within 10px: 64.0%

Epoch 14/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.08it/s, loss=0.00203] 
Validation: 100%|██████████| 29/29 [00:07<00:00,  4.05it/s]


Train Loss: 0.0033
Test Loss:  0.0063
Mean Error: 16.49px
Median Error: 8.77px
Within 2px:  11.1%
Within 5px:  27.9%
Within 10px: 63.8%
✓ Saved best fine-tuned model

Epoch 15/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.09it/s, loss=0.00358] 
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.14it/s]


Train Loss: 0.0033
Test Loss:  0.0064
Mean Error: 16.28px
Median Error: 8.19px
Within 2px:  12.0%
Within 5px:  30.3%
Within 10px: 66.0%

Epoch 16/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.10it/s, loss=0.00247] 
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.16it/s]


Train Loss: 0.0032
Test Loss:  0.0064
Mean Error: 16.74px
Median Error: 8.83px
Within 2px:  11.0%
Within 5px:  28.3%
Within 10px: 63.8%

Epoch 17/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.07it/s, loss=0.000859]
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.23it/s]


Train Loss: 0.0032
Test Loss:  0.0064
Mean Error: 16.57px
Median Error: 8.81px
Within 2px:  11.1%
Within 5px:  28.3%
Within 10px: 64.6%

Epoch 18/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.02it/s, loss=0.00135] 
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.19it/s]


Train Loss: 0.0034
Test Loss:  0.0063
Mean Error: 16.28px
Median Error: 8.20px
Within 2px:  11.8%
Within 5px:  30.6%
Within 10px: 66.2%

Epoch 19/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.09it/s, loss=0.00166] 
Validation: 100%|██████████| 29/29 [00:07<00:00,  4.13it/s]


Train Loss: 0.0032
Test Loss:  0.0064
Mean Error: 16.59px
Median Error: 8.84px
Within 2px:  11.7%
Within 5px:  28.3%
Within 10px: 63.6%

Epoch 20/20
--------------------------------------------------


Training: 100%|██████████| 123/123 [00:30<00:00,  4.07it/s, loss=0.00174] 
Validation: 100%|██████████| 29/29 [00:06<00:00,  4.23it/s]

Train Loss: 0.0032
Test Loss:  0.0063
Mean Error: 16.38px
Median Error: 8.44px
Within 2px:  11.8%
Within 5px:  29.5%
Within 10px: 65.1%

Fine-tuning complete!
Best test loss: 0.0063





## 9. Final Evaluation

In [9]:
# Load best fine-tuned model
checkpoint = torch.load(os.path.join(cfg.checkpoint_dir, 'best_finetuned.pth'))
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

print("="*60)
print("FINAL EVALUATION ON REAL TEST SET")
print("="*60)

test_metrics = validate(model, real_test_loader, criterion, device, cfg.max_offset)

print(f"\nTest Set Performance:")
print(f"  Mean Error:    {test_metrics['mean_error']:.2f}px")
print(f"  Median Error:  {test_metrics['median_error']:.2f}px")
print(f"\nAccuracy:")
print(f"  Within 2px:  {test_metrics['pct_within_2px']*100:.1f}%")
print(f"  Within 5px:  {test_metrics['pct_within_5px']*100:.1f}%")
print(f"  Within 10px: {test_metrics['pct_within_10px']*100:.1f}%")

print(f"\nComparison to CV Baseline:")
print(f"  CV baseline (real): 28% @ 2px, 38-42% @ 5px, 60% @ 10px")
print(f"  ML model (real):    {test_metrics['pct_within_2px']*100:.1f}% @ 2px, {test_metrics['pct_within_5px']*100:.1f}% @ 5px, {test_metrics['pct_within_10px']*100:.1f}% @ 10px")
print(f"\n  Improvement: +{(test_metrics['pct_within_5px']-0.40)*100:.1f}% @ 5px")

FINAL EVALUATION ON REAL TEST SET


Validation: 100%|██████████| 29/29 [00:06<00:00,  4.39it/s]


Test Set Performance:
  Mean Error:    16.49px
  Median Error:  8.77px

Accuracy:
  Within 2px:  11.1%
  Within 5px:  27.9%
  Within 10px: 63.8%

Comparison to CV Baseline:
  CV baseline (real): 28% @ 2px, 38-42% @ 5px, 60% @ 10px
  ML model (real):    11.1% @ 2px, 27.9% @ 5px, 63.8% @ 10px

  Improvement: +-12.1% @ 5px





## 10. Inference Pipeline

In [10]:
def detect_error_bars_ml(
    image_path: str,
    data_points: List[Dict[str, Any]],
    model: nn.Module,
    cfg: Config,
    device: torch.device
) -> Tuple[List[Dict[str, Any]], float]:
    """
    Detect error bar endpoints using ML model.
    
    Args:
        image_path: Path to image
        data_points: List of {"lineName": str, "points": [{"x": float, "y": float}]}
        model: Trained model
        cfg: Configuration
        device: torch device
    
    Returns:
        error_bars: Detection results
        avg_conf: Average confidence (dummy for now)
    """
    model.eval()
    
    # Load image
    img = Image.open(image_path).convert('RGB')
    img_w, img_h = img.size
    
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                           std=[0.229, 0.224, 0.225])
    ])
    
    error_bars = []
    
    with torch.no_grad():
        for line in data_points:
            line_name = line.get("lineName", "line")
            points_out = []
            
            for point in line.get("points", []):
                x, y = int(point["x"]), int(point["y"])
                
                # Extract crop
                x1 = max(0, x - cfg.crop_width // 2)
                x2 = min(img_w, x + cfg.crop_width // 2)
                y1 = max(0, y - cfg.crop_height // 2)
                y2 = min(img_h, y + cfg.crop_height // 2)
                
                crop = img.crop((x1, y1, x2, y2))
                
                # Pad if needed
                if crop.size != (cfg.crop_width, cfg.crop_height):
                    padded = Image.new('RGB', (cfg.crop_width, cfg.crop_height), (255, 255, 255))
                    paste_x = (cfg.crop_width - crop.size[0]) // 2
                    paste_y = (cfg.crop_height - crop.size[1]) // 2
                    padded.paste(crop, (paste_x, paste_y))
                    crop = padded
                
                # Transform and predict
                crop_tensor = transform(crop).unsqueeze(0).to(device)
                offsets = model(crop_tensor).cpu().numpy()[0]
                
                # Denormalize
                dy_up = offsets[0] * cfg.max_offset
                dy_down = offsets[1] * cfg.max_offset
                
                # Compute endpoints
                y_up = int(round(y - dy_up))
                y_down = int(round(y + dy_down))
                
                # Clamp to image bounds
                y_up = max(0, min(img_h - 1, y_up))
                y_down = max(0, min(img_h - 1, y_down))
                
                points_out.append({
                    "data_point": {"x": float(point["x"]), "y": float(point["y"])},
                    "upper_error_bar": {"x": float(point["x"]), "y": float(y_up)},
                    "lower_error_bar": {"x": float(point["x"]), "y": float(y_down)},
                    "confidence": 0.95  # Placeholder
                })
            
            error_bars.append({"lineName": line_name, "points": points_out})
    
    return error_bars, 0.95

print("Inference function defined")

Inference function defined


## 11. Generate Output JSONs

In [11]:
def generate_ml_outputs(
    image_ids: List[str],
    images_dir: str,
    labels_dir: str,
    output_dir: str,
    model: nn.Module,
    cfg: Config,
    device: torch.device
):
    """
    Generate output JSONs for all images using ML model.
    """
    os.makedirs(output_dir, exist_ok=True)
    
    print(f"Generating outputs for {len(image_ids)} images...")
    
    for img_id in tqdm(image_ids):
        img_path = os.path.join(images_dir, f"{img_id}.png")
        label_path = os.path.join(labels_dir, f"{img_id}.json")
        output_path = os.path.join(output_dir, f"{img_id}.json")
        
        if not os.path.exists(label_path):
            continue
        
        # Load label
        with open(label_path, 'r') as f:
            label = json.load(f)
        
        # Convert to input format
        data_points = []
        for line in label:
            line_name = line["label"]["lineName"]
            points = [
                {"x": float(p["x"]), "y": float(p["y"])}
                for p in line["points"]
                if p.get("label", "") == ""
            ]
            if points:
                data_points.append({"lineName": line_name, "points": points})
        
        # Run detection
        error_bars, _ = detect_error_bars_ml(img_path, data_points, model, cfg, device)
        
        # Save output
        output_data = {
            "image_file": f"{img_id}.png",
            "error_bars": error_bars
        }
        
        with open(output_path, 'w') as f:
            json.dump(output_data, f, indent=2)
    
    print(f"✓ Generated {len(image_ids)} output files in {output_dir}")

# Generate for real test set
generate_ml_outputs(
    real_test_ids,
    cfg.real_images,
    cfg.real_labels,
    "outputs_ml_real",
    model,
    cfg,
    device
)

Generating outputs for 30 images...


100%|██████████| 30/30 [00:03<00:00,  8.21it/s]

✓ Generated 30 output files in outputs_ml_real





In [12]:
!zip "ouputs_ml_real.zip" "/kaggle/working/outputs_ml_real"

  adding: kaggle/working/outputs_ml_real/ (stored 0%)
