Unzip the dataset from Google Drive to /content/LogoDet-3K

In [1]:
import zipfile
import os
from google.colab import drive

def unzip_dataset():
    """Extract the LogoDet-3K archive from Google Drive onto the local VM disk.

    Mounts Google Drive at ``/content/drive`` when that path does not exist yet,
    then unpacks the zip archive into ``/content/LogoDet-3K``.

    Raises:
        FileNotFoundError: if the archive is not present at the expected Drive path.

    Errors raised while extracting are reported with ``print`` and not re-raised.
    """
    drive_mount_point = '/content/drive'
    # Archive location on Google Drive.
    archive_path = "/content/drive/MyDrive/Colab Notebooks/LogoDet-3K.zip"
    # Local (ephemeral) VM storage; the original author notes it is much faster
    # for training than reading from Drive.
    target_dir = "/content/LogoDet-3K"

    already_mounted = os.path.exists(drive_mount_point)
    if not already_mounted:
        print("Mounting Google Drive...")
        drive.mount(drive_mount_point)

    # Fail early with a clear message when the archive is missing.
    if not os.path.exists(archive_path):
        raise FileNotFoundError(f"The file was not found at: {archive_path}")

    print(f"Extracting '{archive_path}' to '{target_dir}'...")
    os.makedirs(target_dir, exist_ok=True)

    try:
        with zipfile.ZipFile(archive_path, 'r') as archive:
            # extractall creates the nested directory tree on its own.
            archive.extractall(target_dir)
        print(" Extraction complete successfully.")
        print(f"Files are located at: {target_dir}")
    except zipfile.BadZipFile:
        print(" Error: The file is a corrupted zip archive.")
    except Exception as err:
        print(f" An error occurred: {err}")

# Run the extraction only when this code is executed as the main module
# (not when it is imported from elsewhere).
if __name__ == "__main__":
    unzip_dataset()

Extracting '/content/drive/MyDrive/Colab Notebooks/LogoDet-3K.zip' to '/content/LogoDet-3K'...
✅ Extraction complete successfully.
Files are located at: /content/LogoDet-3K


Delete Duplicates

In [8]:
import os
import xml.etree.ElementTree as ET
import re
import shutil

def _relabel_xml_annotations(folder_path, label):
    """Set every ``<object>/<name>`` in the folder's ``.xml`` files to ``label``."""
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".xml"):
            continue
        xml_path = os.path.join(folder_path, file_name)
        try:
            tree = ET.parse(xml_path)
            for obj in tree.getroot().findall("object"):
                name_elem = obj.find("name")
                if name_elem is None:
                    # Object without a <name>: skip it instead of aborting the
                    # whole file, so the remaining objects still get fixed.
                    continue
                name_elem.text = label
            tree.write(xml_path)
        except Exception as e:
            print(f"Error fixing XML {xml_path}: {e}")


def clean_duplicate_brand_folders(root_dir):
    """
    Collapse duplicated brand folders (``<brand>-1``, ``<brand>-2``, ...) into one.

    For every category folder under ``root_dir``, brand folders whose names end in
    ``-<digits>`` are grouped by the name without that suffix. For each group with
    at least two folders, the alphabetically first folder is renamed to the base
    brand name, the ``<name>`` of every ``<object>`` in its XML files is set to the
    base name, and the remaining folders of the group are deleted.

    A group is left completely untouched (and a message is printed) when the base
    name folder already exists or the rename fails, so that no duplicate is deleted
    unless the kept copy is safely in place.

    Args:
        root_dir: dataset root containing one sub-folder per category.

    Raises:
        OSError: if ``root_dir`` cannot be listed.
    """
    for category in os.listdir(root_dir):
        category_path = os.path.join(root_dir, category)
        if not os.path.isdir(category_path):
            continue

        # Group brand folders carrying a '-number' suffix by their base name.
        base_name_to_folders = {}
        for brand in os.listdir(category_path):
            if not os.path.isdir(os.path.join(category_path, brand)):
                continue
            match = re.match(r"^(.*?)-\d+$", brand)
            if match:
                base_name_to_folders.setdefault(match.group(1), []).append(brand)

        for base_name, folders in base_name_to_folders.items():
            # A single suffixed folder is not a duplicate (it may simply be a
            # brand whose real name ends in "-<digits>"), so leave it alone.
            if len(folders) < 2:
                continue

            folders.sort()  # keep the first folder alphabetically
            folder_to_keep = folders[0]
            folders_to_delete = folders[1:]

            kept_folder_path = os.path.join(category_path, folder_to_keep)
            new_folder_path = os.path.join(category_path, base_name)

            # Never merge into / overwrite an existing un-suffixed brand folder.
            if os.path.exists(new_folder_path):
                print(f"Skipping {base_name}: target folder already exists: {new_folder_path}")
                continue

            print(f"Keeping folder: {folder_to_keep}, renaming to: {base_name}, deleting duplicates: {folders_to_delete}")

            # Rename first: if this fails nothing has been modified yet and the
            # duplicates must not be deleted.
            try:
                os.rename(kept_folder_path, new_folder_path)
                print(f"Renamed folder: {folder_to_keep} -> {base_name}")
            except Exception as e:
                print(f"Error renaming folder {kept_folder_path} -> {new_folder_path}: {e}")
                continue

            # Make the XML labels agree with the new folder name.
            _relabel_xml_annotations(new_folder_path, base_name)

            # Delete the other duplicate folders.
            for folder in folders_to_delete:
                folder_path = os.path.join(category_path, folder)
                try:
                    shutil.rmtree(folder_path)
                    print(f"Deleted folder: {folder_path}")
                except Exception as e:
                    print(f"Error deleting folder {folder_path}: {e}")


# Entry point for the clean-up step: runs only when executed as the main module.
if __name__ == "__main__":
    # Relative path, resolved against the current working directory;
    # replace with your dataset root.
    dataset_dir = "LogoDet-3K/LogoDet-3K"
    clean_duplicate_brand_folders(dataset_dir)
    print("Duplicate brand folders cleaned, renamed, and XML labels fixed.")



Duplicate brand folders cleaned, renamed, and XML labels fixed.


creazione PAN

In [6]:
#########################################################################################################################################
# SOME BRANDS APPEAR TO BE DUPLICATED, E.G. LogoDet-3K\LogoDet-3K\Leisure\stein world-1 AND LogoDet-3K\LogoDet-3K\Leisure\stein world-2 #
#########################################################################################################################################

import os
import glob
import math
import xml.etree.ElementTree as ET
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
import torch
from torchvision import transforms
import random
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from collections import defaultdict

SEED = 101



 # DATASET that takes the list of files for the dataset,
 # builds a dictionary with label -> image paths (label is taken from the image path, not from the xml because it is faster)
 # when an item from the dataset is requested it returns a triplet (anchor, positive, negative)

 # each sample has the structure: sample = {"image": img_transformed, "labels": labels_list, "bbs": bb_list}

class DatasetTriplet(torch.utils.data.Dataset):
    """Dataset that yields (anchor, positive, negative) triplets of annotated images.

    The label of an image is the name of the folder that contains it. Each element
    of a triplet is a dict ``{"image": ..., "labels": [...], "bbs": [...]}`` where
    ``labels`` holds the ``<name>`` of every ``<object>`` in the image's XML file
    and ``bbs`` the matching bounding boxes, rescaled to the transformed image.
    """

    def __init__(self, file_list, transform=None):
        """
        Args:
            file_list: sequence of image paths; an XML annotation with the same
                base name is expected next to every image.
            transform: optional callable applied to the RGB PIL image. Its result
                is indexed as ``shape[1]`` (height) and ``shape[2]`` (width), so it
                is expected to be channel-first. With ``None`` the PIL image is
                returned unchanged and boxes are left unscaled.
        """
        self.file_list = file_list
        self.transform = transform

        # label -> indices into file_list. Built from the paths rather than from
        # the XML files because it is faster.
        self.label_to_indices = defaultdict(list)
        for idx, img_path in enumerate(self.file_list):
            self.label_to_indices[self._label_from_path(img_path)].append(idx)

    @staticmethod
    def _label_from_path(img_path):
        """Return the name of the folder containing ``img_path`` (the brand label)."""
        # Example: LogoDet-3K\LogoDet-3K\Clothes\panerai\21.jpg -> "panerai".
        # Both separators are handled so Windows-style paths work as well.
        return img_path.replace('\\', '/').split('/')[-2]

    def __len__(self):
        # The length is also stored on the instance, as the original code did.
        self.filelength = len(self.file_list)
        return self.filelength

    def load_image(self, image_path):
        """Load one image together with all of its annotated objects.

        Args:
            image_path: path of the image; the annotation is read from the same
                path with the extension replaced by ``.xml``.

        Returns:
            dict with keys ``"image"`` (transformed image), ``"labels"`` (one label
            per annotated object) and ``"bbs"`` (one dict per object with integer
            ``xmin``/``ymin``/``xmax``/``ymax`` in transformed-image pixels).

        Raises:
            Exception: if the XML annotation cannot be parsed.
        """
        # Swap only the extension; str.replace would also rewrite a ".jpg" that
        # happens to appear in a directory name.
        xml_path = os.path.splitext(image_path)[0] + ".xml"

        # Force 3 channels so grayscale / palette / RGBA files work with
        # 3-channel transforms.
        img = Image.open(image_path).convert("RGB")
        orig_w, orig_h = img.size

        if self.transform is None:
            img_transformed = img
            new_w, new_h = orig_w, orig_h
        else:
            img_transformed = self.transform(img)
            new_w, new_h = img_transformed.shape[2], img_transformed.shape[1]

        try:
            tree = ET.parse(xml_path)
            root = tree.getroot()
        except Exception as e:
            raise Exception(f"Failed to parse XML file: {xml_path} | Error: {e}") from e

        # Scale factors that map original pixel coordinates to the transformed image.
        x_scale = new_w / orig_w
        y_scale = new_h / orig_h

        labels_list = []
        bb_list = []
        for obj in root.findall("object"):
            bbox = obj.find("bndbox")
            xmin = int(bbox.find("xmin").text)
            ymin = int(bbox.find("ymin").text)
            xmax = int(bbox.find("xmax").text)
            ymax = int(bbox.find("ymax").text)

            # Append inside the loop so every annotated object is returned,
            # not only the last one.
            labels_list.append(obj.find("name").text)
            bb_list.append({
                "xmin": int(xmin * x_scale),
                "ymin": int(ymin * y_scale),
                "xmax": int(xmax * x_scale),
                "ymax": int(ymax * y_scale)
            })

        return {"image": img_transformed, "labels": labels_list, "bbs": bb_list}

    def __getitem__(self, idx):
        """Return ``(anchor, positive, negative)`` for the image at ``idx``.

        The positive is a different image with the anchor's label; when the label
        has no other image the anchor itself is reused. The negative is a random
        image of a randomly chosen different label.

        Raises:
            IndexError: from ``random.choice`` when the dataset contains a single
                label, because no negative can be drawn.
        """
        anchor_img_path = self.file_list[idx]
        anchor_label = self._label_from_path(anchor_img_path)
        anchor = self.load_image(anchor_img_path)

        # Positive: same label, different image when one exists.
        positive_indices = [i for i in self.label_to_indices[anchor_label] if i != idx]
        # A brand with a single image would make random.choice fail; fall back to
        # the anchor so one sparse brand cannot crash a whole epoch.
        positive_idx = random.choice(positive_indices) if positive_indices else idx
        positive = self.load_image(self.file_list[positive_idx])

        # Negative: random image from a random other label.
        negative_label = random.choice([l for l in self.label_to_indices.keys() if l != anchor_label])
        negative_idx = random.choice(self.label_to_indices[negative_label])
        negative = self.load_image(self.file_list[negative_idx])

        return anchor, positive, negative


# Returns a list of paths divided into train, validation and test.
# the directory dir is scanned to find all brands which are then split into train, validation, test so that validation and test have proportion val_split, test_split
# if total_set_size is provided only that number of images are loaded so that for each brand at least min_images_per_brand images are present

def _collect_brand_images(brand_dirs, per_brand=None):
    """Gather ``*.jpg`` paths from each brand folder, optionally sampling ``per_brand`` of them."""
    image_paths = []
    for brand in brand_dirs:
        # glob.escape keeps brand names containing '[', '*' or '?' from being read
        # as patterns; sorting makes the seeded random.sample below independent of
        # the order in which the filesystem lists files.
        all_images = sorted(glob.glob(os.path.join(glob.escape(brand), '*.jpg')))
        if per_brand is None:
            image_paths.extend(all_images)
            continue
        if per_brand > len(all_images):
            print(f"images are less than {per_brand} for this brand: {brand}")
        image_paths.extend(random.sample(all_images, min(per_brand, len(all_images))))
    return image_paths


def getPathsSetsByBrand(dir, val_split, test_split, total_set_size=None, min_images_per_brand=2):
    """Split the dataset by brand and return image paths for train, validation and test.

    ``dir`` is scanned as ``dir/<category>/<brand>/*.jpg``. Brands (not images) are
    split with a generator seeded by ``SEED``, so the three sets never share a brand.

    Args:
        dir: dataset root; entries that are not directories are ignored.
        val_split: fraction of the brands assigned to the validation set.
        test_split: fraction of the brands assigned to the test set.
        total_set_size: if given, approximate total number of images to load; the
            same number of images is sampled from every brand with the ``random``
            module. ``None`` loads every image.
        min_images_per_brand: lower bound on the images sampled per brand. When
            ``total_set_size`` is too small to reach it with all brands, brands are
            dropped at random until it can be met.

    Returns:
        ``(train_data_list, val_data_list, test_data_list)``, three lists of paths.
    """
    # Sorted listings: os.listdir order is arbitrary, which would make the seeded
    # split differ between machines or runs.
    brand_list = []
    for category in sorted(os.listdir(dir)):
        category_path = os.path.join(dir, category)
        if not os.path.isdir(category_path):
            continue  # stray file at category level
        for brand in sorted(os.listdir(category_path)):
            brand_path = os.path.join(category_path, brand)
            if os.path.isdir(brand_path):
                brand_list.append(brand_path)

    if not brand_list:
        # Nothing to split; also avoids dividing by zero below.
        return [], [], []

    test_size = int(len(brand_list) * test_split)
    val_size = int(len(brand_list) * val_split)
    train_size = len(brand_list) - test_size - val_size

    generator = torch.Generator().manual_seed(SEED)
    train_subset, val_subset, test_subset = random_split(brand_list, [train_size, val_size, test_size], generator=generator)

    # random_split returns Subset objects, not lists: map their indices back to paths.
    train_brand_list = [brand_list[i] for i in train_subset.indices]
    val_brand_list   = [brand_list[i] for i in val_subset.indices]
    test_brand_list  = [brand_list[i] for i in test_subset.indices]

    if total_set_size is None:
        return (
            _collect_brand_images(train_brand_list),
            _collect_brand_images(val_brand_list),
            _collect_brand_images(test_brand_list),
        )

    images_per_brand = round(total_set_size / len(brand_list))

    print(f"Number of brands in training set: {len(train_brand_list)}")
    print(f"Number of brands in validation set: {len(val_brand_list)}")
    print(f"Number of brands in test set: {len(test_brand_list)}")
    print(f"images sampled per brand: {images_per_brand}")

    if images_per_brand < min_images_per_brand:
        # Downscale the number of brands per set to guarantee min_images_per_brand.
        print(f"not enough images per brand, resizing sets")

        new_total_brand_size = round(total_set_size / min_images_per_brand)
        new_val_brand_size = round(new_total_brand_size * val_split)
        new_test_brand_size = round(new_total_brand_size * test_split)
        new_train_brand_size = new_total_brand_size - new_val_brand_size - new_test_brand_size

        def _bounded(requested, population):
            # Rounding can ask for more brands than a split holds (or a negative
            # number); random.sample would raise ValueError in both cases.
            return max(0, min(requested, len(population)))

        train_brand_list = random.sample(train_brand_list, _bounded(new_train_brand_size, train_brand_list))
        val_brand_list = random.sample(val_brand_list, _bounded(new_val_brand_size, val_brand_list))
        test_brand_list = random.sample(test_brand_list, _bounded(new_test_brand_size, test_brand_list))

        print(f"Number of brands in training set: {len(train_brand_list)}")
        print(f"Number of brands in validation set: {len(val_brand_list)}")
        print(f"Number of brands in test set: {len(test_brand_list)}")

        images_per_brand = min_images_per_brand
        print(f"new images sampled per brand: {images_per_brand}")

    train_data_list = _collect_brand_images(train_brand_list, images_per_brand)
    val_data_list = _collect_brand_images(val_brand_list, images_per_brand)
    test_data_list = _collect_brand_images(test_brand_list, images_per_brand)

    return train_data_list, val_data_list, test_data_list



def get_K_RandomImages(data_list, K=5):
    """Return K distinct elements drawn at random from ``data_list``.

    Raises:
        ValueError: if K is larger than the number of elements available.
    """
    available = len(data_list)
    if available < K:
        raise ValueError("K cannot be larger than the size of data_list")
    chosen = random.sample(data_list, K)
    return chosen

def visualizeImagesFromPathList(image_list, title="Images"):
    """Show the images at the given paths side by side in a single row.

    Args:
        image_list: paths of the images to display; each panel is titled with the
            last ``os.sep``-separated component of its path.
        title: overall figure title.
    """
    plt.figure(figsize=(15, 5))

    for position, path in enumerate(image_list, start=1):
        picture = Image.open(path).convert("RGB")
        plt.subplot(1, len(image_list), position)
        plt.imshow(picture)
        file_name = path.rsplit(os.sep, 1)[-1]  # Show filename
        plt.title(file_name)
        plt.axis('off')

    plt.suptitle(title)
    plt.show(block=True)


def main():
    """Build the triplet datasets from LogoDet-3K and display one training triplet.

    Seeds the ``random`` module with ``SEED``, splits the dataset by brand
    (about 1000 images, at least 15 per brand), wraps each split in a
    ``DatasetTriplet`` and plots a random training triplet with its bounding boxes.
    """
    random.seed(SEED)

    dataset_root = "LogoDet-3K/LogoDet-3K"
    test_split = 1/10
    val_split = 1/10

    # Single source of truth for the preprocessing constants: the same mean / std
    # are needed again below to undo the normalisation before plotting.
    image_size = (224, 224)
    norm_mean = [0.485, 0.456, 0.406]
    norm_std = [0.229, 0.224, 0.225]

    train_data_list, val_data_list, test_data_list = getPathsSetsByBrand(dataset_root, val_split, test_split, 1000, 15)

    print(f"images in the training set: {len(train_data_list)}")
    print(f"images in the validation set: {len(val_data_list)}")
    print(f"images in the test set: {len(test_data_list)}")

    transform = transforms.Compose([
        transforms.Resize(image_size),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std)
    ])

    train_dataset = DatasetTriplet(train_data_list, transform)
    val_dataset = DatasetTriplet(val_data_list, transform)
    test_dataset = DatasetTriplet(test_data_list, transform)

    sample_idx = random.randint(0, len(train_dataset) - 1)
    anchor, positive, negative = train_dataset[sample_idx]

    def show_triplet_with_bboxes(anchor, positive, negative):
        """Plot the three samples in a row and draw their bounding boxes."""
        plt.figure(figsize=(15, 5))
        panel_titles = ["Anchor", "Positive", "Negative"]

        for i, item in enumerate([anchor, positive, negative]):
            # channel-first tensor -> height x width x channel array for imshow
            img = item["image"].permute(1, 2, 0).numpy()
            img = img * norm_std + norm_mean  # unnormalize
            img = img.clip(0, 1)

            ax = plt.subplot(1, 3, i+1)
            ax.imshow(img)
            ax.set_title(panel_titles[i] + "\n" + ", ".join(item["labels"]))
            ax.axis('off')

            # The boxes returned by the dataset are already expressed in the
            # pixels of the transformed image, so they are drawn as-is; scaling
            # them again by a hard-coded 224 would misplace them as soon as the
            # resize size changes.
            for bbox in item["bbs"]:
                rect = patches.Rectangle(
                    (bbox["xmin"], bbox["ymin"]),
                    bbox["xmax"] - bbox["xmin"],
                    bbox["ymax"] - bbox["ymin"],
                    linewidth=2, edgecolor='r', facecolor='none'
                )
                ax.add_patch(rect)

        plt.suptitle("Sample Triplet with Bounding Boxes")
        plt.show()

    show_triplet_with_bboxes(anchor, positive, negative)

#def diagnostic_main():
#    print("--- [1] INITIALIZATION & SETUP ---")
#    random.seed(SEED)
#    torch.manual_seed(SEED)
#
#    # Path to your dataset
#    dataset_dir = "LogoDet-3K/LogoDet-3K"
#
#    # Check if directory exists before proceeding to avoid crash
#    if not os.path.exists(dataset_dir):
#        print(f"ERROR: Directory '{dataset_dir}' not found. Please adjust the path.")
#        return
#
#    print(f"Data Source: {dataset_dir}")
#
#    # --- FEATURE TEST 1: Class-Disjoint Splitting ---
#    print("\n--- [2] TESTING SPLIT LOGIC (Feature: Zero-Overlap Brands) ---")
#
#    # Using strict parameters to force the logic to work hard
#    train_paths, val_paths, test_paths = getPathsSetsByBrand(
#        dataset_dir,
#        val_split=0.1,
#        test_split=0.1,
#        total_set_size=500,  # Small size for quick debugging
#        min_images_per_brand=5
#    )
#
#    # Extract Brand Names from paths to verify no overlap
#    def get_brands(path_list):
#        return set([p.replace('\\', '/').split('/')[-2] for p in path_list])
#
#    train_brands = get_brands(train_paths)
#    val_brands = get_brands(val_paths)
#    test_brands = get_brands(test_paths)
#
#    print(f"Unique Brands in Train: {len(train_brands)}")
#    print(f"Unique Brands in Val:   {len(val_brands)}")
#    print(f"Unique Brands in Test:  {len(test_brands)}")
#
#    # intersection verification
#    overlap = train_brands.intersection(test_brands)
#    if len(overlap) == 0:
#        print(" SUCCESS: No data leakage. Train and Test brands are completely disjoint.")
#    else:
#        print(f" FAILURE: Data leakage detected! Overlapping brands: {overlap}")
#
#    # --- FEATURE TEST 2: Triplet Logic ---
#    print("\n--- [3] TESTING TRIPLET INTEGRITY (Feature: A-P same class, A-N different) ---")
#
#    transform = transforms.Compose([
#        transforms.Resize((224, 224)),
#        transforms.ToTensor()
#    ])
#
#    # Create the dataset
#    train_ds = DatasetTriplet(train_paths, transform)
#
#    # Fetch a random sample
#    idx = random.randint(0, len(train_ds)-1)
#    anchor, positive, negative = train_ds[idx]
#
#    # Check 1: Anchor and Positive must have the same label
#    label_A = anchor['labels'][0]
#    label_P = positive['labels'][0]
#    label_N = negative['labels'][0]
#
#    print(f"Sample Index: {idx}")
#    print(f"Anchor Label:   {label_A}")
#    print(f"Positive Label: {label_P}")
#    print(f"Negative Label: {label_N}")
#
#    if label_A == label_P:
#        print(" SUCCESS: Anchor and Positive are the same class.")
#    else:
#        print(" FAILURE: Anchor and Positive mismatch!")
#
#    if label_A != label_N:
#        print(" SUCCESS: Anchor and Negative are different classes.")
#    else:
#        print(" FAILURE: Anchor and Negative are the same class (Hard Negative or Error)!")
#
#    # --- FEATURE TEST 3: Bounding Box Rescaling ---
#    print("\n--- [4] TESTING BBOX RESCALING (Feature: Coordinates matching 224x224) ---")
#
#    # The image is resized to 224x224. BBoxes must be within [0, 224].
#    bbox_A = anchor['bbs'][0]
#    print(f"Resized BBox coordinates: {bbox_A}")
#
#    valid_coords = (0 <= bbox_A['xmin'] < bbox_A['xmax'] <= 224) and \
#                   (0 <= bbox_A['ymin'] < bbox_A['ymax'] <= 224)
#
#    if valid_coords:
#        print(" SUCCESS: Bounding box coordinates are valid within the 224x224 tensor.")
#    else:
#        print(" FAILURE: Bounding box coordinates are out of bounds or inverted.")
#
#    print("\n--- TEST COMPLETE ---")
#
#    # Optional: Visualize the verified triplet
#    # show_triplet_with_bboxes(anchor, positive, negative)
#
#if __name__ == "__main__":
#    diagnostic_main()
