In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from matplotlib import pyplot as plt

import utils
import rdd_data_loader

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from torchvision.models.detection import rpn
from torchvision.models.detection import FasterRCNN
from torchvision.ops import MultiScaleRoIAlign

import torchvision.transforms.v2 as T_v2
from torchvision.transforms.v2 import functional as F

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# DinoV2 transformation

# Create a proper transform for object detection that transforms both images and targets
def get_transform(resize_size=512, crop_size=448,
                  mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Build the preprocessing pipeline applied to each detection sample.

    The pipeline is composed entirely of ``torchvision.transforms.v2``
    transforms: resize, center-crop, conversion to a float image tensor, and
    per-channel normalization. Bounding boxes are only updated by the spatial
    steps when they are passed in as ``tv_tensors.BoundingBoxes`` together
    with the image.

    Args:
        resize_size: Size argument forwarded to ``T_v2.Resize``.
        crop_size: Size argument forwarded to ``T_v2.CenterCrop``.
        mean: Per-channel means forwarded to ``T_v2.Normalize``.
        std: Per-channel standard deviations forwarded to ``T_v2.Normalize``.

    Returns:
        A ``T_v2.Compose`` that applies the steps in the order listed above.
    """
    return T_v2.Compose([
        # Spatial transforms.
        T_v2.Resize(resize_size),
        T_v2.CenterCrop(crop_size),
        # `T_v2.ToTensor()` is deprecated in the v2 API; ToImage + ToDtype with
        # scale=True is the documented replacement and likewise produces a
        # float32 image scaled to [0, 1].
        T_v2.ToImage(),
        T_v2.ToDtype(torch.float32, scale=True),
        T_v2.Normalize(mean=list(mean), std=list(std)),
    ])


# Preprocessing pipeline shared by every training sample.
dino_transform = get_transform()

# Training set described by the CSV file; each sample is passed through
# `dino_transform` by the dataset class.
train_dataset = rdd_data_loader.RoadDamageDataset(
    csv_file='train_paths.csv',
    transforms=dino_transform,
)

# Shuffled mini-batches of 4 samples, batched with the project's collate_fn.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=rdd_data_loader.collate_fn,
)



In [4]:
# Sanity check: pull a single batch from the training loader, move it to
# `device`, and print its targets.
# NOTE(review): a previously captured run of this cell printed box coordinates
# larger than the 448-pixel crop, which suggests the boxes may not be going
# through the spatial transforms — verify that the dataset passes them to the
# transform as tv_tensors.BoundingBoxes.
pbar = tqdm(
    enumerate(train_dataloader),
    total=1,
    desc=f"Epoch {10+1}",
)

for batch_idx, (images, targets) in pbar:
    # Move every image in the batch onto the selected device.
    images = [image.to(device) for image in images]
    # Rebuild each target dict with all of its values moved to the same device.
    targets = [
        {key: value.to(device) for key, value in target.items()}
        for target in targets
    ]

    print(targets)

    # Only the first batch is inspected.
    break


Epoch 11:   0%|          | 0/1 [00:01<?, ?it/s]

[{'boxes': tensor([[264., 600., 312., 672.],
        [345., 514., 379., 561.]], device='cuda:0'), 'labels': tensor([5, 5], device='cuda:0')}, {'boxes': tensor([[147., 188., 267., 327.]], device='cuda:0'), 'labels': tensor([2], device='cuda:0')}, {'boxes': tensor([[ 74., 325., 111., 591.]], device='cuda:0'), 'labels': tensor([5], device='cuda:0')}, {'boxes': tensor([[442., 527., 562., 552.],
        [ 42., 459., 251., 493.],
        [496., 611., 640., 640.]], device='cuda:0'), 'labels': tensor([1, 1, 1], device='cuda:0')}]





In [41]:
from torchvision.transforms import v2
from torchvision import tv_tensors
from PIL import Image
import numpy as np

# Demo: run one image and one bounding box through the resize/crop/normalize
# pipeline and print the box before and after.

# Load image as PIL Image first
path = "sample_scene_city.jpg"
img_pil = Image.open(path).convert("RGB")

# PIL reports size as (width, height)
img_width, img_height = img_pil.size

# BoundingBoxes expects canvas_size as (height, width), the reverse of PIL
boxes = tv_tensors.BoundingBoxes(
    [[554, 720, 578, 740]],
    format="XYXY",
    canvas_size=(img_height, img_width)
)

# Define transforms.
# `v2.ToTensor()` is deprecated in the v2 API; ToImage + ToDtype(scale=True)
# is the documented replacement and likewise yields a float32 image scaled to
# [0, 1].
transforms = v2.Compose([
    v2.Resize(512),
    v2.CenterCrop(448),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406],
                 std=[0.229, 0.224, 0.225])
])

print("Original boxes:", boxes)
print("Original image size:", img_pil.size)  # PIL image size is (W, H)

# Passing the image and the boxes together lets the spatial transforms
# update the box coordinates along with the pixels.
out_img, out_boxes = transforms(img_pil, boxes)

print("Transformed boxes:", out_boxes)
print("Transformed image shape:", out_img.shape)


Original boxes: BoundingBoxes([[554, 720, 578, 740]], format=BoundingBoxFormat.XYXY, canvas_size=(1125, 2000))
Original image size: (2000, 1125)
Transformed boxes: BoundingBoxes([[ 21, 295,  31, 304]], format=BoundingBoxFormat.XYXY, canvas_size=(448, 448))
Transformed image shape: torch.Size([3, 448, 448])
