In [None]:
    !git clone https://github.com/facebookresearch/dinov3.git
    %cd dinov3

Cloning into 'dinov3'...
remote: Enumerating objects: 503, done.[K
remote: Counting objects: 100% (290/290), done.[K
remote: Compressing objects: 100% (197/197), done.[K
remote: Total 503 (delta 173), reused 103 (delta 93), pack-reused 213 (from 2)[K
Receiving objects: 100% (503/503), 9.87 MiB | 18.45 MiB/s, done.
Resolving deltas: 100% (205/205), done.
/content/dinov3


In [None]:
!pip install torchmetrics
!pip install termcolor
!pip install datasets huggingface_hub
!pip install -q datasets fiftyone torch torchvision pillow



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
#from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms
import numpy as np
from PIL import Image
import fiftyone as fo
import torchvision.models as models
from datasets import load_dataset

In [None]:
# Import the CarDD export from Google Drive; the directory is read as a
# `fo.types.FiftyOneDataset` export.
dataset = fo.Dataset.from_dir(
    dataset_type=fo.types.FiftyOneDataset,
    dataset_dir="/content/drive/MyDrive/Assignment3/CarDD",
)

# Sanity check: how many samples were imported and what one record looks like.
print(f"{len(dataset)} samples loaded")
print(dataset.first())

Importing samples...


INFO:fiftyone.utils.data.importers:Importing samples...


 100% |███████████████| 2816/2816 [114.9ms elapsed, 0s remaining, 25.0K samples/s] 


INFO:eta.core.utils: 100% |███████████████| 2816/2816 [114.9ms elapsed, 0s remaining, 25.0K samples/s] 


2816 samples loaded
<Sample: {
    'id': '686be771e1d7135d782c77e9',
    'media_type': 'image',
    'filepath': '/content/drive/MyDrive/Assignment3/CarDD/data/000001.jpg',
    'tags': [],
    'metadata': <ImageMetadata: {
        'size_bytes': None,
        'mime_type': None,
        'width': 1000,
        'height': 750,
        'num_channels': None,
    }>,
    'created_at': datetime.datetime(2025, 11, 2, 6, 6, 13, 129000),
    'last_modified_at': datetime.datetime(2025, 11, 2, 6, 6, 13, 129000),
    'detections': <Detections: {
        'detections': [
            <Detection: {
                'id': '686be771e1d7135d782c77e5',
                'attributes': {},
                'tags': [],
                'label': 'scratch',
                'bounding_box': [0.16704, 0.05361333333333333, 0.20279, 0.17512],
                'mask': None,
                'mask_path': None,
                'confidence': None,
                'index': None,
                'iscrowd': 0,
                'occlu

In [None]:
# Build the backbone through the `dinov3_vitb16` hub entry point of the local
# repository checkout, passing the checkpoint path through the `weights` kwarg.
REPO_DIR = '/content/dinov3'
weights = '/content/dinov3_vitb16_pretrain_lvd1689m-73cec8be.pth'
dino = torch.hub.load(
    repo_or_dir=REPO_DIR,
    model='dinov3_vitb16',
    source='local',
    weights=weights,
)

In [None]:
class CarDD_Dataset(torch.utils.data.Dataset):
    """Expose FiftyOne samples as ``(image_tensor, mask_tensor)`` training pairs.

    Each item is the RGB image resized to ``image_size`` and normalized with the
    ImageNet mean/std, plus a ``(1, H, W)`` float mask with values in {0, 1}
    built from the first instance in the sample's ``segmentations`` field.

    Args:
        dataset: iterable of FiftyOne samples; it is materialized into a list.
        image_size: target ``(height, width)``, following the torchvision
            ``Resize`` convention.
    """

    def __init__(self, dataset, image_size=(224, 224)):
        self.samples = list(dataset)
        self.image_size = image_size
        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            # ImageNet channel statistics.
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        return len(self.samples)

    @staticmethod
    def _mask_to_full_frame(mask, bounding_box, height, width):
        """Paste a box-relative instance mask into a full-frame 0/1 uint8 array.

        Args:
            mask: 2D array-like covering only the instance's bounding box.
            bounding_box: ``[x, y, w, h]`` as fractions of the image size.
            height: image height in pixels.
            width: image width in pixels.

        Returns:
            ``(height, width)`` uint8 array that is 1 inside the instance.

        Raises:
            ValueError: if ``mask`` is not two-dimensional.
        """
        full = np.zeros((height, width), dtype=np.uint8)
        crop = np.asarray(mask) > 0
        if crop.size == 0:
            return full
        if crop.ndim != 2:
            raise ValueError(f"expected a 2D instance mask, got shape {crop.shape}")

        rel_x, rel_y, rel_w, rel_h = bounding_box
        # Clamp to the frame and keep the box at least one pixel wide/tall.
        x0 = min(max(int(round(rel_x * width)), 0), width - 1)
        y0 = min(max(int(round(rel_y * height)), 0), height - 1)
        x1 = min(max(int(round((rel_x + rel_w) * width)), x0 + 1), width)
        y1 = min(max(int(round((rel_y + rel_h) * height)), y0 + 1), height)

        # Nearest-neighbour resample of the crop onto the pixel box.
        rows = np.arange(y1 - y0) * crop.shape[0] // (y1 - y0)
        cols = np.arange(x1 - x0) * crop.shape[1] // (x1 - x0)
        full[y0:y1, x0:x1] = crop[rows[:, None], cols[None, :]]
        return full

    def __getitem__(self, idx):
        sample = self.samples[idx]
        image = Image.open(sample.filepath).convert("RGB")  # force 3-channel RGB
        frame_width, frame_height = image.size

        segmentations = sample.segmentations
        instances = segmentations.detections if segmentations is not None else []
        if instances:
            # Only the first annotated instance is used as the training target.
            instance = instances[0]
            if instance.mask is None:
                raise ValueError(
                    f"sample {sample.filepath} has a detection without an in-memory mask"
                )
            # FiftyOne documents Detection.mask as covering only the detection's
            # bounding box, so it must be placed at the box position instead of
            # being stretched over the whole image.
            full_mask = self._mask_to_full_frame(
                instance.mask, instance.bounding_box, frame_height, frame_width
            )
        else:
            # No annotated instance: treat the sample as having an empty mask.
            full_mask = np.zeros((frame_height, frame_width), dtype=np.uint8)

        # A 2D uint8 array is read as mode "L" without the deprecated `mode` argument.
        mask_img = Image.fromarray(full_mask * 255)
        # PIL expects (width, height) whereas image_size is (height, width).
        mask_img = mask_img.resize(tuple(self.image_size)[::-1], Image.NEAREST)

        # Scale back to {0, 1} and add the channel dimension.
        mask_tensor = torch.from_numpy(np.array(mask_img)).float() / 255.0
        mask_tensor = mask_tensor.unsqueeze(0)

        return self.transform(image), mask_tensor


class SegmentationHead(nn.Module):
    """Small convolutional decoder that maps backbone features to mask logits.

    A 3x3 convolution with batch norm reduces ``in_channels`` to 256, two
    stride-2 transposed convolutions each double the spatial resolution
    (256 -> 128 -> 64 channels), and a 1x1 convolution emits ``num_classes``
    output channels.
    """

    @staticmethod
    def _double_resolution(channels_in, channels_out):
        """Return the layers of one 2x upsampling stage."""
        return [
            nn.ConvTranspose2d(channels_in, channels_out, kernel_size=2, stride=2),
            nn.ReLU(inplace=True),
        ]

    def __init__(self, in_channels=768, num_classes=1):
        super().__init__()
        # The order below fixes the `decoder.<index>` keys stored in checkpoints.
        stages = [
            nn.Conv2d(in_channels, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            *self._double_resolution(256, 128),
            *self._double_resolution(128, 64),
            nn.Conv2d(64, num_classes, kernel_size=1),
        ]
        self.decoder = nn.Sequential(*stages)

    def forward(self, x):
        logits = self.decoder(x)
        return logits


class DINOv3_Model(nn.Module):
    """Combine a feature-extracting backbone with a segmentation head.

    Args:
        backbone: module exposing ``get_intermediate_layers(x, n=1)`` whose
            first result is a ``(batch, tokens, channels)`` tensor.
        head: module mapping a ``(batch, channels, h, w)`` feature map to logits.
    """

    def __init__(self, backbone, head):
        super().__init__()
        self.backbone = backbone
        self.head = head

    def forward(self, x):
        """Return logits resized to the spatial size of ``x``.

        Raises:
            ValueError: if the token count cannot be arranged into a grid with
                the same aspect ratio as the input.
        """
        # Tokens requested with n=1, i.e. a single intermediate layer.
        features = self.backbone.get_intermediate_layers(x, n=1)[0]
        batch_size, number_tokens, channel_dimension = features.shape

        # Derive the patch grid from the input aspect ratio rather than assuming
        # a square grid, so non-square inputs are reshaped correctly. For square
        # inputs this reduces to sqrt(number_tokens).
        input_height, input_width = x.shape[-2:]
        height = max(1, round((number_tokens * input_height / input_width) ** 0.5))
        width = number_tokens // height
        if height * width != number_tokens:
            raise ValueError(
                f"cannot arrange {number_tokens} tokens into a patch grid for an "
                f"input of size {input_height}x{input_width}"
            )

        features = features.transpose(1, 2).reshape(batch_size, channel_dimension, height, width)
        mask = self.head(features)  # decoder network
        # Upsample the logits back to the input resolution.
        mask = F.interpolate(mask, size=x.shape[2:], mode='bilinear', align_corners=False)
        return mask


# Split the data into disjoint training (80%) and validation (20%) sets.
total_samples = len(dataset)
train_size = int(0.8 * total_samples)
val_size = total_samples - train_size

# FiftyOne's take() draws a *random* sample while skip() drops the first N in
# stored order, so combining them lets validation samples leak into training.
# Shuffle once with a fixed seed and cut that single ordering in two instead.
SPLIT_SEED = 51
shuffled_dataset = dataset.shuffle(seed=SPLIT_SEED)
train_dataset_fo = shuffled_dataset.limit(train_size)
val_dataset_fo = shuffled_dataset.skip(train_size)

print(f"Training samples: {len(train_dataset_fo)}")
print(f"Validation samples: {len(val_dataset_fo)}")

# PyTorch datasets and loaders
train_dataset = CarDD_Dataset(train_dataset_fo)
val_dataset = CarDD_Dataset(val_dataset_fo)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Pretrained backbone plus a freshly initialised segmentation head, on the GPU.
model = DINOv3_Model(dino, SegmentationHead()).cuda()

# Freeze the backbone so only the head is trained.
print("Freezing backbone parameters...")
for param in model.backbone.parameters():
    param.requires_grad = False

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,}")
print(f"Total parameters: {total_params:,}")

# Binary cross-entropy on raw logits; Adam updates the head parameters only.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.head.parameters(), lr=1e-4)

# Reduced from the originally planned 500 epochs.
num_epochs = 50
best_val_loss = float('inf')

def mask_to_bbox(mask):
    """Return the tight bounding box of a single predicted mask.

    Args:
        mask: tensor holding one 2D mask, optionally with leading singleton
            dimensions (for example ``(1, 1, H, W)``). Values above 0.5 count
            as foreground.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` as plain ints with inclusive maxima,
        or ``None`` when no value exceeds the threshold.

    Raises:
        ValueError: if a non-empty mask has a non-singleton leading dimension.
    """
    mask_binary = (mask > 0.5).cpu().numpy()
    if not mask_binary.any():
        return None
    # Keep exactly the last two (spatial) axes. A blanket squeeze() would also
    # drop a size-1 height or width and break the (row, col) unpacking below.
    mask_binary = np.atleast_2d(mask_binary)
    mask_binary = mask_binary.reshape(mask_binary.shape[-2:])
    coords = np.argwhere(mask_binary)  # rows of (y, x)
    y_min, x_min = coords.min(axis=0)
    y_max, x_max = coords.max(axis=0)
    return [int(x_min), int(y_min), int(x_max), int(y_max)]

# Train the segmentation head, validate after every epoch, and checkpoint both
# the best model so far and a snapshot every 10 epochs.
print("\nStarting training..")
for epoch in range(num_epochs):
    model.train()
    # The backbone is frozen, so keep it in inference mode; model.train() would
    # otherwise switch any train-only behaviour inside it back on.
    model.backbone.eval()
    train_loss = 0.0

    for images, masks in train_loader:
        images, masks = images.cuda(), masks.cuda()

        # forward pass
        preds = model(images)
        loss = criterion(preds, masks)

        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        train_loss += loss.item() * images.size(0)

    avg_train_loss = train_loss / len(train_loader.dataset)

    # validation
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for images, masks in val_loader:
            images, masks = images.cuda(), masks.cuda()
            preds = model(images)
            loss = criterion(preds, masks)
            val_loss += loss.item() * images.size(0)

    avg_val_loss = val_loss / len(val_loader.dataset)

    is_report_epoch = (epoch + 1) % 10 == 0
    is_best = avg_val_loss < best_val_loss

    # print progress every 10 epochs
    if is_report_epoch:
        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    if is_best or is_report_epoch:
        # Same payload for both kinds of checkpoint.
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': avg_train_loss,
            'val_loss': avg_val_loss,
        }

    # keep the model with the lowest validation loss seen so far
    if is_best:
        best_val_loss = avg_val_loss
        torch.save(checkpoint, '/content/drive/MyDrive/Assignment3/best_segmentation_model.pth')
        print(f"   Best model saved! Val Loss: {avg_val_loss:.4f}")

    # periodic snapshot every 10 epochs
    if is_report_epoch:
        torch.save(checkpoint, f'/content/drive/MyDrive/Assignment3/checkpoint_epoch_{epoch+1}.pth')
        print(f"   Checkpoint saved at epoch {epoch+1}")

print("\nTraining completed!")
print(f"Best validation loss: {best_val_loss:.4f}")

Training samples: 2252
Validation samples: 564
Freezing backbone parameters...
Trainable parameters: 1,934,337
Total parameters: 87,603,969

Starting training...


  mask_img = Image.fromarray(mask.astype(np.uint8) * 255, mode='L')


  → Best model saved! Val Loss: 0.5216
  → Best model saved! Val Loss: 0.4843
  → Best model saved! Val Loss: 0.4675
  → Best model saved! Val Loss: 0.4588
  → Best model saved! Val Loss: 0.4516
  → Best model saved! Val Loss: 0.4457
  → Best model saved! Val Loss: 0.4385
  → Best model saved! Val Loss: 0.4295
  → Best model saved! Val Loss: 0.4294
Epoch 10/50 - Train Loss: 0.4310, Val Loss: 0.4212
  → Best model saved! Val Loss: 0.4212
  → Checkpoint saved at epoch 10
  → Best model saved! Val Loss: 0.4186
  → Best model saved! Val Loss: 0.4044
  → Best model saved! Val Loss: 0.3984
  → Best model saved! Val Loss: 0.3965
  → Best model saved! Val Loss: 0.3849
  → Best model saved! Val Loss: 0.3814
  → Best model saved! Val Loss: 0.3736
  → Best model saved! Val Loss: 0.3713
Epoch 20/50 - Train Loss: 0.3420, Val Loss: 0.3549
  → Best model saved! Val Loss: 0.3549
  → Checkpoint saved at epoch 20
  → Best model saved! Val Loss: 0.3541
  → Best model saved! Val Loss: 0.3455
  → Best mode