Contrastive Learning
---------------------------------------------------
This script implements training via a Siamese Network.

- Training Logic: The model processes pairs of images with a binary label (0/1).

- Objective (Loss): Uses `CosineEmbeddingLoss`.
  Maximizes cosine similarity for positive pairs and minimizes it for negative ones.

- Data Management: The `DatasetContrastive` class has two distinct features:
  1. Supports XML parsing to extract bounding boxes and crop objects.
  2. Enforces a 50/50 balance between positive and negative pairs to avoid statistical bias.

LogoResNet50
----------------------------------
`LogoResNet50` class:
- Backbone: Pre-trained ResNet50.
- Head: Replaces the classifier with a linear layer for 128-dimensional embedding output.
- Fine-tuning: `freeze_numer_of_layer` method for progressive block freezing.
"""

In [None]:
import torch

class Config:
    """Class-level settings shared by the dataset, model and training code."""

    # ---- 1. General setup ----
    project_name = "FewShot"

    # Output folders for logs and for model checkpoints
    logs_dir = "./logs"
    checkpoints_dir = "./checkpoints"

    # Compute device, in order of preference: Apple MPS, then CUDA, then CPU
    device = (
        "mps" if torch.backends.mps.is_available()
        else "cuda" if torch.cuda.is_available()
        else "cpu"
    )
    seed = 42  # Fixed seed for reproducible runs

    # ---- 2. Dataset location ----
    dataset_root = "LogoDet-3K/LogoDet-3K-divided"
    csv_index_path = "LogoDet-3K"

    # Split ratios: 70% train, 20% validation
    train_split_ratio = 0.7
    val_split_ratio = 0.2

    # ---- 3. Training hyperparameters ----
    epochs = 20
    batch_size = 8
    learning_rate = 1e-5

    # ---- 4. Model architecture ----
    backbone = "resnet50"
    pretrained = True
    embedding_dim = 128

    # Path of an already trained model (empty by default)
    trained_model_path = ""

    # Prediction threshold used at inference time to decide whether two logos
    # are the same. The attribute name is misspelled but kept unchanged because
    # it is part of this class's interface.
    prediciton_threashold = 0.5

    # Number of backbone blocks to freeze (0 = none)
    freeze_layers = 0
    # Transfer-learning strategy switch
    freeze_early_layers = True
    # Epoch after which all layers are meant to be unfrozen for fine-tuning
    unfreeze_at_epoch = 5

    # ---- 5. Loss function ----
    margin = 0.2           # Margin separating embeddings of different logos


In [None]:
import os
import sys
import ssl
import random
import torch.nn as nn
# NOTE(security): this swaps the default HTTPS context factory for the
# unverified one, so HTTPS contexts created through the standard library after
# this line skip certificate verification for the rest of the process.
# Presumably added so the pretrained weights can be downloaded on machines
# whose certificate store is broken -- TODO confirm and scope it more narrowly.
ssl._create_default_https_context = ssl._create_unverified_context
import glob
import xml.etree.ElementTree as ET
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim

from torchvision import transforms
import torchvision.models as models
from torchvision.models import ResNet50_Weights

import matplotlib.pyplot as plt
import matplotlib.patches as patches

from collections import defaultdict

# NOTE(review): SEED is not used by the seeding calls below, which read
# Config.seed instead; it looks like a leftover constant.
SEED = 101
# Seed Python's `random` module and PyTorch's global RNG from Config.seed.
random.seed(Config.seed)
torch.manual_seed(Config.seed)

Custom PyTorch Dataset for Contrastive Learning with Bounding Box support (XML).

Key operations:
1. XML Parsing: Loads images and extracts object bounding boxes from Pascal VOC-style XML files.
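2. Coordinate Scaling: Rescales bounding box coordinates by the ratio between the transformed and the original image size. This accounts for plain resizing only; the random crop and horizontal flip applied by the training transform are not reflected in the boxes.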
3. Pair Generation: Returns a pair of images (Anchor + Second Image) and a binary label (0/1).
   - Uses a forced 50/50 probability for Positive/Negative pairs.
   - This balance is crucial: using the natural distribution would result in ~99% negatives, causing the model to simply learn to predict "different" for everything.

In [None]:
class DatasetContrastive(Dataset):
    """Pair dataset for contrastive (Siamese) training on logo images.

    Each item is a tuple ``(img1, img2, is_positive_pair)`` where ``img1`` and
    ``img2`` are the dictionaries produced by :meth:`load_image` and
    ``is_positive_pair`` is 1 when both images share the same brand label and
    0 otherwise. Positive and negative pairs are drawn with equal probability.

    The brand label of an image is the name of its parent folder.

    Args:
        file_list: Paths of the ``.jpg`` images; a Pascal VOC-style ``.xml``
            annotation with the same base name must sit next to each image.
        transform: Optional callable applied to the RGB PIL image.
        all_objects: When False (default) only the last ``<object>`` of each
            annotation is returned, which is what this class has always done
            and keeps the ``labels`` / ``bbs`` lists at a fixed length of one
            so samples can be batched by the DataLoader's default collation.
            When True every annotated object is returned; batching images
            with different object counts then needs a custom ``collate_fn``.
    """

    def __init__(self, file_list, transform=None, all_objects=False):
        self.file_list = file_list
        self.transform = transform
        self.all_objects = all_objects
        # Kept for backward compatibility: this attribute used to be set as a
        # side effect of calling len() on the dataset.
        self.filelength = len(self.file_list)

        # Brand label -> indices of the images carrying that label.
        self.label_to_indices = defaultdict(list)
        for idx, img_path in enumerate(self.file_list):
            self.label_to_indices[self._label_from_path(img_path)].append(idx)

    def __len__(self):
        return len(self.file_list)

    @staticmethod
    def _label_from_path(img_path):
        """Return the brand label, i.e. the parent folder name of *img_path*.

        Example: ``LogoDet-3K\\LogoDet-3K\\Clothes\\panerai\\21.jpg`` -> ``panerai``.
        Both ``/`` and ``\\`` are accepted as separators.
        """
        return img_path.replace('\\', '/').split('/')[-2]

    @staticmethod
    def _parse_annotation(xml_path, x_scale, y_scale):
        """Read every ``<object>`` of a Pascal VOC-style annotation file.

        Args:
            xml_path: Path of the XML annotation.
            x_scale: Factor applied to the horizontal box coordinates.
            y_scale: Factor applied to the vertical box coordinates.

        Returns:
            ``(labels, boxes)``: two parallel lists, one entry per object.
            Each box is a dict with integer ``xmin``/``ymin``/``xmax``/``ymax``.

        Raises:
            Exception: If the file cannot be read or is not well-formed XML.
        """
        try:
            root = ET.parse(xml_path).getroot()
        except (ET.ParseError, OSError) as e:
            raise Exception(f"Failed to parse XML file: {xml_path} | Error: {e}") from e

        labels, boxes = [], []
        for obj in root.findall("object"):
            bbox = obj.find("bndbox")
            labels.append(obj.find("name").text)
            boxes.append({
                "xmin": int(int(bbox.find("xmin").text) * x_scale),
                "ymin": int(int(bbox.find("ymin").text) * y_scale),
                "xmax": int(int(bbox.find("xmax").text) * x_scale),
                "ymax": int(int(bbox.find("ymax").text) * y_scale),
            })
        return labels, boxes

    def load_image(self, image_path):
        """Load one image together with its labels and rescaled bounding boxes.

        Returns:
            A dict with keys ``image`` (the transformed image), ``labels``
            (list of object names) and ``bbs`` (list of box dicts scaled by
            the ratio between transformed and original image size).

        Raises:
            Exception: If the XML annotation cannot be parsed.
            ValueError: If the annotation has no ``<object>`` and
                ``all_objects`` is False.
        """
        # splitext swaps only the real extension; str.replace would also
        # rewrite ".jpg" appearing elsewhere in the path and miss ".JPG".
        xml_path = os.path.splitext(image_path)[0] + ".xml"

        # The context manager closes the underlying file handle.
        with Image.open(image_path) as img:
            orig_w, orig_h = img.size
            # Force three channels so grayscale / RGBA / CMYK files do not
            # break transforms that normalize with 3-channel statistics.
            rgb = img.convert("RGB")

        img_transformed = rgb if self.transform is None else self.transform(rgb)

        if hasattr(img_transformed, "shape"):
            # Tensor output, indexed as (channels, height, width).
            new_h, new_w = img_transformed.shape[1], img_transformed.shape[2]
        else:
            # Still a PIL image (no transform, or a transform returning PIL).
            new_w, new_h = img_transformed.size

        # The scale factors are the same for every object of the image.
        x_scale = new_w / orig_w
        y_scale = new_h / orig_h

        labels_list, bb_list = self._parse_annotation(xml_path, x_scale, y_scale)

        if not self.all_objects:
            if not labels_list:
                raise ValueError(f"No <object> entries found in annotation: {xml_path}")
            # Historical behaviour: keep only the last annotated object.
            labels_list, bb_list = labels_list[-1:], bb_list[-1:]

        return {"image": img_transformed, "labels": labels_list, "bbs": bb_list}

    def __getitem__(self, idx):
        """Return ``(img1, img2, is_positive_pair)`` for the image at *idx*.

        Raises:
            RuntimeError: If a positive pair was drawn but no other image
                shares the anchor's label.
        """
        img_path = self.file_list[idx]
        label = self._label_from_path(img_path)

        img1 = self.load_image(img_path)

        # Draw positive/negative with equal probability. With the natural
        # distribution of the dataset almost every sampled pair would be
        # negative, which is unlikely to help training.
        is_positive_pair = random.choice([0, 1])

        if is_positive_pair:
            # Any other image of the same brand.
            pos_indices = [i for i in self.label_to_indices[label] if i != idx]
            if not pos_indices:
                # Raise instead of exit(): inside a DataLoader worker exit()
                # only kills the worker and hides the real cause.
                raise RuntimeError(
                    "ERROR LOADING A POSITIVE MATCH FOR THE LOADED IMAGE: "
                    f"no other image with label '{label}' (index {idx}, path {img_path})"
                )
            idx2 = random.choice(pos_indices)
        else:
            # Pick a different brand first, then one of its images.
            neg_label = random.choice([l for l in self.label_to_indices if l != label])
            idx2 = random.choice(self.label_to_indices[neg_label])

        img2 = self.load_image(self.file_list[idx2])

        # is_positive_pair is returned so callers do not have to compare the
        # labels of the two images themselves.
        return img1, img2, is_positive_pair



Splits the dataset into training and validation sets at the brand level, ensuring no class overlap.

Key operations:
1. Brand Separation: Divides brand folders into train/val subsets based on `val_split` using a fixed seed to ensure reproducibility.
2. Adaptive Downsampling: If `total_set_size` is enforced, calculates the quota of images per brand. If this falls below `min_images_per_brand`, it reduces the number of participating brands to ensure the remaining ones meet the minimum image count.
3. Image Collection: Randomly samples the calculated number of images for each selected brand, or retrieves all images if no total size limit is set.

In [None]:
def _collect_brand_images(brand_dirs, images_per_brand, min_images_per_brand, split_name):
    """Gather the '*.jpg' paths of the given brand folders, sub-sampling if asked.

    Args:
        brand_dirs: Brand folder paths to scan.
        images_per_brand: Maximum number of images to draw per brand, or None
            to keep every image.
        min_images_per_brand: Brands with fewer images trigger a warning
            (only when sub-sampling).
        split_name: "TRAIN" or "VALIDATION"; used in the warning text.
    """
    collected = []
    for brand in brand_dirs:
        # glob.escape keeps brand names containing glob metacharacters literal.
        # Sorting makes the result independent of the filesystem's listing
        # order, so the seeded random.sample below is reproducible.
        imgs = sorted(glob.glob(os.path.join(glob.escape(brand), '*.jpg')))

        if images_per_brand is None:
            collected.extend(imgs)
            continue

        if len(imgs) < min_images_per_brand:
            print(f"images are less than {min_images_per_brand} for this brand: {brand} in the {split_name} set")

        collected.extend(random.sample(imgs, min(images_per_brand, len(imgs))))
    return collected


def getTrainValPaths(root_dir, val_split, total_set_size=None, min_images_per_brand=2):
    """Split the brands under ``<root_dir>/train_val`` into train and validation image lists.

    The split is done per brand folder (``train_val/<category>/<brand>``), so
    no brand appears in both sets. Brand folders are visited in sorted order
    and split with a generator seeded from ``Config.seed``.

    Args:
        root_dir: Dataset root containing the ``train_val`` folder.
        val_split: Fraction of the brands assigned to the validation set.
        total_set_size: Optional target for the total number of images. The
            per-brand quota is ``round(total_set_size / number_of_brands)``;
            when that falls below ``min_images_per_brand`` the number of
            brands is reduced instead so each kept brand can contribute
            ``min_images_per_brand`` images.
        min_images_per_brand: Minimum per-brand quota (see above).

    Returns:
        ``(train_paths, val_paths)``: two lists of image paths. Both are empty
        when ``train_val`` is missing or contains no brand folder.
    """
    train_val_path = os.path.join(root_dir, 'train_val')

    if not os.path.exists(train_val_path):
        print(f"Warning: {train_val_path} not found.")
        return [], []

    # Collect brand folders. os.listdir order is arbitrary, so sort to get the
    # same seeded split on every machine.
    train_val_brands = []
    for category in sorted(os.listdir(train_val_path)):
        cat_path = os.path.join(train_val_path, category)
        if not os.path.isdir(cat_path):
            continue
        for brand in sorted(os.listdir(cat_path)):
            brand_full_path = os.path.join(cat_path, brand)
            if os.path.isdir(brand_full_path):
                train_val_brands.append(brand_full_path)

    if not train_val_brands:
        # Without this guard the per-brand quota below divides by zero.
        print(f"Warning: no brand folders found in {train_val_path}.")
        return [], []

    # Split brands into Train and Val
    val_size = int(len(train_val_brands) * val_split)
    train_size = len(train_val_brands) - val_size
    generator = torch.Generator().manual_seed(Config.seed)
    train_subset, val_subset = random_split(train_val_brands, [train_size, val_size], generator=generator)

    train_brand_list = [train_val_brands[i] for i in train_subset.indices]
    val_brand_list = [train_val_brands[i] for i in val_subset.indices]

    images_per_brand = None
    if total_set_size is not None:
        images_per_brand = round(total_set_size / len(train_val_brands))

        if images_per_brand < min_images_per_brand:
            print(f"Not enough images per brand ({images_per_brand}), downscaling brand sets to ensure {min_images_per_brand} images/brand.")

            # Number of brands the image budget can cover at the minimum quota
            new_total_brand_count = round(total_set_size / min_images_per_brand)
            new_val_size = round(new_total_brand_count * val_split)
            new_train_size = new_total_brand_count - new_val_size

            train_brand_list = random.sample(train_brand_list, min(len(train_brand_list), new_train_size))
            val_brand_list = random.sample(val_brand_list, min(len(val_brand_list), new_val_size))
            images_per_brand = min_images_per_brand

    train_data_list = _collect_brand_images(train_brand_list, images_per_brand, min_images_per_brand, "TRAIN")
    val_data_list = _collect_brand_images(val_brand_list, images_per_brand, min_images_per_brand, "VALIDATION")

    return train_data_list, val_data_list




ResNet50-based architecture modified for Metric Learning (generating embeddings).

Key operations:
1. Backbone Initialization: Loads a standard ResNet50 (optionally with ImageNet weights).
2. Head Replacement: Swaps the original 1000-class classifier with a linear projection layer to output embeddings of size `embedding_dim`.
3. Progressive Freezing: Implements a custom `freeze_numer_of_layer` method to selectively freeze backbone blocks (from shallow 'conv1' to deep 'layer4') for controlled fine-tuning.

In [None]:

class LogoResNet50(nn.Module):
    """ResNet50 whose classifier is replaced by an embedding head.

    Args:
        embedding_dim: Size of the output embedding.
        pretrained: Load ``ResNet50_Weights.DEFAULT`` when True, otherwise
            start from randomly initialized weights.
        num_of_freeze_layer: Initial freezing level, forwarded to
            :meth:`freeze_numer_of_layer`.
        activation_fn: Optional module appended after the linear projection.
    """

    def __init__(self, embedding_dim=128, pretrained=True, num_of_freeze_layer=5, activation_fn=None):
        super().__init__()

        # Backbone, with or without the pretrained weights.
        chosen_weights = ResNet50_Weights.DEFAULT if pretrained else None
        self.model = models.resnet50(weights=chosen_weights)

        # Embedding head: a linear projection from the backbone's feature size
        # to embedding_dim, optionally followed by the given activation.
        backbone_features = self.model.fc.in_features
        projection = [nn.Linear(backbone_features, embedding_dim)]
        if activation_fn is not None:
            projection.append(activation_fn)
        self.model.fc = nn.Sequential(*projection)

        # Backbone attribute names grouped by freezing level, shallow to deep.
        # Level k freezes the first k groups.
        self.blocks = [
            ['conv1', 'bn1'],
            ['layer1'],
            ['layer2'],
            ['layer3'],
            ['layer4'],
        ]

        # Apply the requested initial freezing level.
        self.freeze_numer_of_layer(num_of_freeze_layer)

    def forward(self, x):
        """Return the embedding produced by the wrapped network for ``x``."""
        return self.model(x)

    def freeze_numer_of_layer(self, num_of_freeze_layer):
        """Freeze the first ``num_of_freeze_layer`` block groups of the backbone.

        All parameters are first made trainable again, so every call starts
        from a clean state.

        Args:
            num_of_freeze_layer (int):
              0   -> nothing frozen (full fine-tuning)
              1-5 -> freeze that many groups of ``self.blocks``, shallow to
                     deep; larger values are capped at the number of groups.
        """
        # Reset: everything trainable.
        for weight in self.model.parameters():
            weight.requires_grad = True

        if num_of_freeze_layer == 0:
            print("Configuration: Full Fine-Tuning (All layers are trainable)")
            return

        # Cap the level so indexing self.blocks cannot go out of range.
        depth = min(num_of_freeze_layer, len(self.blocks))

        # Names of all submodules belonging to the first `depth` groups.
        frozen_names = [name for level in range(depth) for name in self.blocks[level]]

        for name in frozen_names:
            for weight in getattr(self.model, name).parameters():
                weight.requires_grad = False

        print(f"Freezing Level {depth}. Frozen blocks: {frozen_names}")



Main training script for a Siamese Network using Contrastive Learning.

Key operations:
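1.  **Environment Setup**: Imports the required modules (in this notebook `Config` and the helper classes are defined in the cells above rather than imported from the custom modules `configs` and `PY_script`) and handles SSL contexts for downloading pretrained weights.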
2.  **Data Preparation**: 
    -   Splits the dataset into training and validation sets using `getTrainValPaths`.
    -   Applies **Data Augmentation** (RandomResizedCrop, Flip, ColorJitter) to the training set to improve model robustness.
    -   Uses standard resizing and normalization for validation.
3.  **Model Initialization**: Loads the `LogoResNet50` model (a ResNet50 backbone whose classifier is replaced by an embedding head) and moves it to the configured device (MPS/GPU/CPU).
4.  **Optimization Setup**: 
    -   **Loss**: Uses `CosineEmbeddingLoss`, which pushes the cosine similarity of positive pairs towards 1 and penalizes negative pairs only while their cosine similarity exceeds `Config.margin`.
    -   **Optimizer**: Uses Adam to update model weights.
5.  **Training Loop**:
    -   Iterates through epochs and batches.
    -   Extracts embeddings for `img1` and `img2`.
    -   **Label Conversion**: Converts binary labels (0/1) into the format required by CosineEmbeddingLoss (1 for similar, -1 for dissimilar).
    -   Performs backpropagation and updates weights.
6.  **Validation Loop**: Evaluates the model on the validation set without computing gradients (`torch.no_grad`) to monitor generalization performance.
7.  **Checkpointing**: Saves the model state dictionary every 5 epochs for future inference or resuming training.

In [None]:
print("Reading script train.py")


def _pair_loss(model, criterion, img1, img2, label, device):
    """Embed both images of every pair in the batch and return the loss.

    Args:
        model: Network producing one embedding per image.
        criterion: Loss taking ``(emb1, emb2, target)`` with targets +1 / -1.
        img1, img2: Batched dataset samples; only their ``'image'`` entry is used.
        label: Batched pair labels, 1 for same brand and 0 for different.
        device: Device the tensors are moved to.
    """
    first = img1['image'].to(device)
    second = img2['image'].to(device)

    # CosineEmbeddingLoss expects +1 (similar) / -1 (dissimilar) targets,
    # while the dataset yields 1 / 0.
    target = label.to(device).float()
    target[target == 0] = -1

    return criterion(model(first), model(second), target)


def main():
    """Train the Siamese LogoResNet50 with CosineEmbeddingLoss.

    Steps: resolve the dataset paths, build the train/validation loaders,
    create the model, then for every epoch run one training pass and one
    validation pass, saving a checkpoint every 5 epochs. Returns early (after
    printing an error) when the dataset folder or the training files are
    missing.
    """
    print(f"Starting training for: {Config.project_name}")

    device = torch.device(Config.device)
    print(f"Using device: {device}")

    # 1. DATA AND AUGMENTATION
    print("Loading dataset...")

    full_dataset_path = os.path.abspath(Config.dataset_root)
    print(f"Looking for dataset at: {full_dataset_path}")

    if not os.path.exists(Config.dataset_root):
        print(f"CRITICAL ERROR: The folder '{Config.dataset_root}' does not exist.")
        print("Verify that you have downloaded the dataset and that the path in Config.dataset_root is correct.")
        return  # Nothing to train on

    train_files, val_files = getTrainValPaths(
        Config.dataset_root,
        val_split=Config.val_split_ratio,
        min_images_per_brand=2
    )

    # SAFETY CHECK: a DataLoader over an empty training set cannot be used.
    if len(train_files) == 0:
        print("CRITICAL ERROR: No training files found.")
        print(f"Check that '{Config.dataset_root}/train_val' contains the category and brand folders.")
        return

    print(f"Training files found: {len(train_files)}")
    print(f"Validation files found: {len(val_files)}")

    # Training transformations (with augmentation)
    transform = transforms.Compose([
        transforms.RandomResizedCrop(224, scale=(0.5, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_dataset = DatasetContrastive(train_files, transform=transform)
    train_loader = DataLoader(train_dataset,
                              batch_size=Config.batch_size,
                              shuffle=True,
                              num_workers=2,
                              persistent_workers=True)

    # 2. VALIDATION DATASET (plain resize, no augmentation)
    val_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    val_dataset = DatasetContrastive(val_files, transform=val_transform)
    val_loader = DataLoader(val_dataset, batch_size=Config.batch_size, shuffle=False, num_workers=2)

    # 3. MODEL
    print("Inizialitation LogoResNet50")
    freeze_layers = getattr(Config, 'freeze_layers', 0)

    model = LogoResNet50(
        embedding_dim=Config.embedding_dim,
        pretrained=Config.pretrained,
        num_of_freeze_layer=freeze_layers
    )

    model = model.to(device)

    # 4. LOSS AND OPTIMIZER
    criterion = nn.CosineEmbeddingLoss(margin=Config.margin)
    # Only the parameters that are trainable right now are handed to Adam.
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=Config.learning_rate)

    # 5. TRAINING LOOP
    print("Starting training cycle")

    for epoch in range(Config.epochs):
        model.train()
        total_loss = 0

        for batch_idx, (img1, img2, label) in enumerate(train_loader):
            optimizer.zero_grad()

            loss = _pair_loss(model, criterion, img1, img2, label, device)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if batch_idx % 10 == 0:
                print(f"Epoch [{epoch+1}/{Config.epochs}] Batch {batch_idx} - Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} done! Average Loss: {avg_loss:.4f}")

        # STARTING VALIDATION
        if len(val_loader) == 0:
            # An empty validation split would otherwise divide by zero below.
            print(f"VALIDATION Epoch {epoch+1}: skipped (no validation batches)")
        else:
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for v_img1, v_img2, v_label in val_loader:
                    val_loss += _pair_loss(model, criterion, v_img1, v_img2, v_label, device).item()

            avg_val_loss = val_loss / len(val_loader)
            print(f"VALIDATION Epoch {epoch+1}: Loss = {avg_val_loss:.4f}")
        print("-" * 50)

        # Store checkpoint every 5 epochs
        if (epoch + 1) % 5 == 0:
            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(Config.checkpoints_dir, exist_ok=True)
            torch.save(model.state_dict(), f"{Config.checkpoints_dir}/model_epoch_{epoch+1}.pth")
            print("Checkpoint saved!")


if __name__ == "__main__":
    main()

: 