In [38]:
import matplotlib.pyplot as plt

from nuscenes.prediction.input_representation.static_layers import StaticLayerRasterizer
from nuscenes.prediction.input_representation.agents import AgentBoxesWithFadedHistory
from nuscenes.prediction.input_representation.interface import InputRepresentation
from nuscenes.prediction.input_representation.combinators import Rasterizer

from nuscenes.prediction import PredictHelper
from nuscenes.utils.splits import create_splits_scenes

from nuscenes import NuScenes
import os
import json
from itertools import chain
from typing import List

import shutil

import torch
import numpy as np
import shutil
from PIL import Image
from pathlib import Path
from collections import defaultdict
from transformers import CLIPProcessor, CLIPModel
import argparse
from tqdm.auto import tqdm
from annoy import AnnoyIndex
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
import pickle
import random

from folder_creator import sample_extractor

### Cluster and batch handling

In [45]:
import os
import shutil
from pathlib import Path
import clip
import torch
from torchvision import transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Pick the GPU when one is available and load the CLIP 'ViT-B/32' checkpoint
# together with its image preprocessing callable. `device`, `model` and
# `preprocess` are module-level globals read by cluster_and_organize below.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

class ImageDataset(Dataset):
    """Flat-folder image dataset yielding ``(image, path)`` pairs.

    Only regular files located directly inside ``root_dir`` whose extension
    (compared case-insensitively) is one of ``IMAGE_SUFFIXES`` are included.
    The file list is sorted so iteration order is reproducible across runs.

    Args:
        root_dir: Folder that holds the images.
        classes: Class names; stored on the instance but not used for loading.
        transform: Optional callable applied to each RGB ``PIL.Image``.
    """

    # Lower-case file extensions that are treated as images.
    IMAGE_SUFFIXES = ('.jpg', '.png', '.jpeg')

    def __init__(self, root_dir, classes, transform=None):
        self.root_dir = Path(root_dir)
        self.classes = classes
        self.transform = transform
        # Case-insensitive suffix match so "IMG.JPG" is not silently dropped;
        # is_file() keeps directories with image-like names out of the list.
        self.files = sorted(
            f for f in self.root_dir.iterdir()
            if f.is_file() and f.suffix.lower() in self.IMAGE_SUFFIXES
        )

    def __len__(self):
        """Return the number of image files discovered in ``root_dir``."""
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(image, path_string)`` for the file at position ``idx``."""
        img_path = self.files[idx]
        # convert() produces an independent in-memory image, so the file
        # handle can be released right away instead of leaking per sample.
        with Image.open(img_path) as opened:
            image = opened.convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, str(img_path)

def cluster_and_organize(cluster_path, new_cluster_path, threshold, classes, batch_size=32):
    """Zero-shot sort the images in ``cluster_path`` into per-class folders.

    Every image is compared against the text prompts ``"a photo of a <class>"``
    using the module-level CLIP ``model``; the image is then copied (not moved)
    into ``new_cluster_path/<best matching class>``.

    Args:
        cluster_path: Folder with the images to classify (created if missing).
        new_cluster_path: Output root; one sub-folder per predicted class.
        threshold: Currently unused. The low-confidence fallback that would
            route images to a 'unique' class is disabled (see the note in the
            loop below).
        classes: List of class names used to build the text prompts.
        batch_size: Number of images scored per forward pass.
    """
    # Make sure the input and output folders exist before touching them.
    os.makedirs(cluster_path, exist_ok=True)
    os.makedirs(new_cluster_path, exist_ok=True)

    # `preprocess` already returns one tensor per image; the default collate
    # function stacks those into a single batch tensor.
    dataset = ImageDataset(cluster_path, classes, transform=preprocess)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Inference only: keep the text tower out of autograd as well, otherwise
    # the text features hold on to a computation graph for the whole loop.
    with torch.no_grad():
        text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in classes]).to(device)
        text_features = model.encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    for images, paths in tqdm(dataloader):
        images = images.to(device)
        with torch.no_grad():
            image_features = model.encode_image(images)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            values, indices = torch.max(similarity, dim=-1)
        for path, index in zip(paths, indices):
            predicted_class = classes[int(index)]
            # Disabled fallback (would use `values` and `threshold`):
            # if value < threshold:
            #     predicted_class = 'unique'
            destination_dir = Path(new_cluster_path) / predicted_class
            destination_dir.mkdir(exist_ok=True)
            shutil.copy(path, destination_dir)

    print("Images have been organized into clusters based on their top predictions.")

### View amount of samples in each cluster

In [46]:
import os

def cluster_length(cluster_path, classes):
    """Print, for each class name, how many entries its sub-folder contains.

    Args:
        cluster_path: Root folder that holds one sub-folder per class.
        classes: Iterable of sub-folder names to report on.

    Raises:
        FileNotFoundError: If a class sub-folder does not exist.
    """
    base_dir = Path(cluster_path)
    for class_name in classes:
        entry_count = len(os.listdir(base_dir / class_name))
        print(f"{class_name} length: {entry_count}")

### Cluster each class

In [47]:
# Candidate class names; entries that are commented out are excluded.
# NOTE(review): `classes` is reassigned in a later cell before the visible
# calls that consume it, so this particular list may be stale -- confirm.
classes = [#'car',
                #'small car',
                #'big car',
                'bicycle',
                'motorcycle', 
                'truck',
                'trailer',
                #'person',
                #'barricade',
                'traffic cone',
                'bus',
                'construction vehicle'
                ]

def copy_folder_contents(source_folder, destination_folder):
    """Recursively copy everything inside ``source_folder`` into ``destination_folder``.

    The destination folder is created when it does not exist. Regular files
    are copied with ``shutil.copy2``; sub-folders are handled by recursion.
    Entries that are neither a file nor a directory are ignored.
    """
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)

    for entry_name in os.listdir(source_folder):
        src = os.path.join(source_folder, entry_name)
        dst = os.path.join(destination_folder, entry_name)

        if os.path.isfile(src):
            shutil.copy2(src, dst)
            continue

        if os.path.isdir(src):
            # Descend into the sub-folder, mirroring it under the destination.
            copy_folder_contents(src, dst)

# Main function to process images and cluster them based on conceptual similarity using CLIP embeddings.
def process_images(image_directory, clip_model, threshold, batch_size):
    """Cluster the images of one folder by CLIP embedding similarity.

    Steps: optionally reuse ``embeddings.npy`` from the folder, otherwise embed
    every image with the Hugging Face CLIP checkpoint ``clip_model``; build an
    Annoy index; run average-linkage hierarchical clustering cut at
    ``threshold``; finally move the files into ``cluster_<n>``, ``unique`` and
    ``corrupted`` sub-folders of ``image_directory``.

    Args:
        image_directory: Folder whose top-level images are clustered in place.
        clip_model: Model identifier passed to ``from_pretrained``.
        threshold: Distance cut-off handed to ``apply_clustering``.
        batch_size: Number of images embedded per forward pass.

    Raises:
        ValueError: If the number of embeddings does not match the number of
            readable images (for example a stale ``embeddings.npy``).
    """
    image_directory = Path(image_directory)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    embeddings_file = image_directory / 'embeddings.npy'
    regenerate_embeddings = check_and_load_embeddings(embeddings_file)

    # The CLIP weights are only needed when embeddings are recomputed;
    # generate_embeddings returns the cached array before touching the model.
    model = processor = None
    if regenerate_embeddings:
        model = CLIPModel.from_pretrained(clip_model).to(device)
        processor = CLIPProcessor.from_pretrained(clip_model)
    allowed_extensions = {".jpeg", ".jpg", ".png", ".webp"}

    images_to_paths, all_image_ids = get_images_to_paths(image_directory, allowed_extensions)
    damaged_image_ids, all_embeddings = generate_embeddings(all_image_ids, images_to_paths, model, processor, device, batch_size, regenerate_embeddings, embeddings_file)

    if regenerate_embeddings:
        np.save(embeddings_file, all_embeddings)

    # Damaged images produce no embedding row. Cluster labels must therefore be
    # paired with the readable ids only; zipping against `all_image_ids` would
    # shift every label after the first damaged image onto the wrong file.
    embedded_image_ids = [image_id for image_id in all_image_ids if image_id not in damaged_image_ids]
    if len(embedded_image_ids) != len(all_embeddings):
        raise ValueError(
            f"Found {len(embedded_image_ids)} readable images but {len(all_embeddings)} embeddings in "
            f"{image_directory}; the cached embeddings file is probably stale and must be regenerated."
        )

    if len(embedded_image_ids) < 2:
        # Hierarchical clustering needs at least two observations; with fewer
        # there is nothing to group, so every readable image ends up 'unique'.
        print("Fewer than two readable images found; skipping clustering.")
        image_id_clusters = defaultdict(set)
    else:
        print("Building Annoy index...")
        annoy_index = build_annoy_index(all_embeddings)

        print("Computing distance matrix...")
        distances = compute_distance_matrix(all_embeddings, annoy_index)

        print("Applying hierarchical clustering...")
        labels = apply_clustering(distances, threshold)

        image_id_clusters = build_image_clusters(embedded_image_ids, labels)

    organize_images(images_to_paths, image_directory, image_id_clusters, damaged_image_ids)

# Decide whether embeddings must be regenerated or an existing embeddings file can be reused
def check_and_load_embeddings(embeddings_file):
    """Return ``True`` when embeddings must be (re)generated, ``False`` to reuse the file.

    When ``embeddings_file`` exists the user is asked interactively whether to
    reuse it; an empty answer, ``y`` or ``yes`` (any case) means reuse. The
    array itself is not read here: the caller loads it later, so reading it in
    this function only doubled the I/O and discarded the result.

    Args:
        embeddings_file: ``pathlib.Path`` of the cached ``.npy`` file.
    """
    if not embeddings_file.exists():
        return True

    use_existing_embeddings = input("Embeddings file found. Do you want to use existing embeddings? (Y/N) ").strip().lower()
    if use_existing_embeddings in ('', 'y', 'yes'):
        print("Loading embeddings from file...")
        return False
    return True

# Collect the images of a directory, keyed by file stem
def get_images_to_paths(image_directory, allowed_extensions):
    """Map image ids (file stems) to their paths for one directory level.

    Args:
        image_directory: ``pathlib.Path`` whose direct children are scanned.
        allowed_extensions: Collection of lower-case suffixes such as ``".jpg"``.

    Returns:
        ``(images_to_paths, image_ids)`` where ``image_ids`` lists the dict
        keys in insertion order. Entries sharing a stem collapse to one key.
    """
    images_to_paths = {}
    for candidate in image_directory.iterdir():
        if candidate.suffix.lower() not in allowed_extensions:
            continue
        images_to_paths[candidate.stem] = candidate
    return images_to_paths, [*images_to_paths]

# Generate CLIP embeddings for all images, handling damaged images if any
def generate_embeddings(all_image_ids, images_to_paths, model, processor, device, batch_size, regenerate_embeddings, embeddings_file):
    """Return ``(damaged_image_ids, embeddings)`` for the given images.

    When ``regenerate_embeddings`` is false the cached array is loaded from
    ``embeddings_file`` and an empty damaged set is returned; ``model`` and
    ``processor`` are not used in that case. Otherwise the images are embedded
    in batches of ``batch_size``. Images that fail to load are added to the
    damaged set and contribute no embedding, so the returned list can be
    shorter than ``all_image_ids``.

    Returns:
        A set of damaged image ids and either a NumPy array (cached path) or
        a list with one embedding vector per readable image.
    """
    if not regenerate_embeddings:
        return set(), np.load(embeddings_file)

    damaged_image_ids, all_embeddings = set(), []

    # The context manager closes the progress bar even if a batch raises.
    with tqdm(total=len(all_image_ids), desc="Generating CLIP embeddings") as progress_bar:
        for i in range(0, len(all_image_ids), batch_size):
            batch_image_ids, batch_images = process_image_batch(all_image_ids, i, batch_size, images_to_paths, damaged_image_ids)

            # A batch made up solely of damaged images has nothing to embed;
            # handing an empty list to the processor would raise.
            if batch_images:
                inputs = processor(images=batch_images, return_tensors="pt", padding=True).to(device)

                with torch.no_grad():
                    outputs = model.get_image_features(**inputs)

                all_embeddings.extend(outputs.cpu().numpy())

            progress_bar.update(len(batch_image_ids))

    return damaged_image_ids, all_embeddings

# Load one batch of images and record the ids of files that cannot be read
def process_image_batch(all_image_ids, start_idx, batch_size, images_to_paths, damaged_image_ids):
    """Open the images for ``all_image_ids[start_idx:start_idx + batch_size]``.

    Files that raise ``OSError`` while being opened or loaded are reported on
    stdout and their ids are added to ``damaged_image_ids`` (mutated in place).

    Returns:
        ``(batch_image_ids, batch_images)``. The id list is the full slice,
        including damaged ids; the image list holds only the loaded images.
    """
    stop_idx = start_idx + batch_size
    batch_image_ids = all_image_ids[start_idx:stop_idx]
    loaded_images = []

    for current_id in batch_image_ids:
        try:
            picture = Image.open(images_to_paths[current_id])
            picture.load()
        except OSError:
            print(f"\nError processing image {images_to_paths[current_id]}, marking as corrupted.")
            damaged_image_ids.add(current_id)
        else:
            loaded_images.append(picture)

    return batch_image_ids, loaded_images

# Build an Annoy index using the generated CLIP embeddings
def build_annoy_index(all_embeddings, n_trees=100, metric="angular"):
    """Create and build an ``AnnoyIndex`` holding one item per embedding.

    Item ``i`` of the index corresponds to ``all_embeddings[i]``.

    Args:
        all_embeddings: Sequence of equal-length vectors (list or 2-D array).
        n_trees: Number of trees passed to ``AnnoyIndex.build``. Defaults to
            the previously hard-coded value of 100.
        metric: Distance metric name passed to ``AnnoyIndex``. Defaults to the
            previously hard-coded ``"angular"``.

    Raises:
        ValueError: If ``all_embeddings`` is empty or not two-dimensional, in
            which case the vector dimensionality cannot be determined.
    """
    embeddings = np.array(all_embeddings)
    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        raise ValueError(
            f"Expected a non-empty 2-D collection of embeddings, got shape {embeddings.shape}."
        )
    n_dimensions = embeddings.shape[1]

    annoy_index = AnnoyIndex(n_dimensions, metric)
    for i, embedding in enumerate(embeddings):
        annoy_index.add_item(i, embedding)

    annoy_index.build(n_trees)
    return annoy_index

# Compute the pairwise distances of the embeddings using the Annoy index
def compute_distance_matrix(all_embeddings, annoy_index):
    """Return the pairwise distances between all indexed items as a flat list.

    Pairs are visited as ``(0,1), (0,2), ..., (1,2), ...`` -- every ``i < j``
    in row-major order -- and each distance comes from
    ``annoy_index.get_distance(i, j)``. Only the length of ``all_embeddings``
    is used.
    """
    count = len(all_embeddings)
    return [
        annoy_index.get_distance(row, col)
        for row in range(count)
        for col in range(row + 1, count)
    ]

# Apply hierarchical clustering on the computed distances with the given threshold
def apply_clustering(distances, threshold):
    """Cluster items from a flat list of pairwise distances.

    Runs SciPy average linkage (with optimal leaf ordering) on ``distances``
    and cuts the tree with ``fcluster`` using ``criterion='distance'`` and
    ``t=threshold``.

    Returns:
        The array of flat cluster labels produced by ``fcluster``.
    """
    linkage_matrix = linkage(
        np.array(distances),
        method='average',
        optimal_ordering=True,
    )
    flat_labels = fcluster(linkage_matrix, t=threshold, criterion='distance')
    return flat_labels

# Group image ids by their cluster label
def build_image_clusters(all_image_ids, labels):
    """Return a ``defaultdict(set)`` mapping each cluster label to its image ids.

    Ids and labels are paired positionally with ``zip``, so surplus elements
    of the longer input are ignored.
    """
    clusters_by_label = defaultdict(set)
    for member_id, label in zip(all_image_ids, labels):
        members = clusters_by_label[label]
        members.add(member_id)
    return clusters_by_label

# Sort images into cluster folders, a 'unique' folder and a 'corrupted' folder
def organize_images(images_to_paths, image_directory, image_id_clusters, damaged_image_ids):
    """Move every image into its destination sub-folder of ``image_directory``.

    * Clusters with two or more members go to ``cluster_<idx>``, where ``idx``
      is the cluster's position in ``image_id_clusters`` (so numbering has
      gaps where single-member clusters were skipped).
    * Ids that are neither damaged nor in such a cluster go to ``unique``; the
      move helper is invoked for it even when that set is empty.
    * Damaged ids, when there are any, go to ``corrupted``.
    """
    clustered_ids = set()
    for position, members in enumerate(image_id_clusters.values()):
        if len(members) >= 2:
            move_images_to_directory(image_directory, f"cluster_{position}", members, images_to_paths)
            clustered_ids.update(members)

    leftover_ids = set(images_to_paths.keys()) - set(damaged_image_ids) - clustered_ids
    move_images_to_directory(image_directory, "unique", leftover_ids, images_to_paths)

    if damaged_image_ids:
        move_images_to_directory(image_directory, "corrupted", damaged_image_ids, images_to_paths)

# Move the given images into a sub-folder of image_directory
def move_images_to_directory(image_directory, folder_name, image_ids, images_to_paths):
    """Move the files for ``image_ids`` into ``image_directory / folder_name``.

    The target folder is created (including parents) when missing. Each file
    keeps its own name; its source path is looked up in ``images_to_paths``.
    """
    target_dir = image_directory / folder_name
    target_dir.mkdir(parents=True, exist_ok=True)

    for src_path in (images_to_paths[image_id] for image_id in image_ids):
        shutil.move(src_path, target_dir / src_path.name)


def count_folders_and_images(directory):
    """Collect one representative name per cluster folder plus all 'unique' images.

    Despite its name, this function returns names rather than counts:

    * For every sub-folder of ``directory`` other than ``unique``, the name
      (text before the first ``"."``) of its alphabetically first entry is
      taken as the cluster's representative. Empty folders contribute nothing.
    * For the ``unique`` sub-folder, the name of every regular file with an
      image extension (``.png/.jpg/.jpeg/.gif/.bmp``, case-insensitive) is added.

    Listings are sorted so the result is reproducible, and entries without a
    dot are used as-is instead of raising ``ValueError``.

    Returns:
        List of names (file names truncated at the first dot).
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    valid_new_data = []

    for item in sorted(os.listdir(directory)):
        item_path = os.path.join(directory, item)
        if not os.path.isdir(item_path):
            continue

        entries = sorted(os.listdir(item_path))

        if item != "unique":
            # One representative per cluster folder.
            if entries:
                valid_new_data.append(entries[0].partition(".")[0])
            continue

        # Every image stored in the "unique" folder counts individually.
        for unique_item in entries:
            unique_item_path = os.path.join(item_path, unique_item)
            if os.path.isfile(unique_item_path) and unique_item.lower().endswith(image_suffixes):
                valid_new_data.append(unique_item.partition(".")[0])

    return valid_new_data


def count_folders_and_images_not_found_in_training(directory, training_names):
    """Return the names of clustered images that are new relative to the training set.

    Despite its name, this function returns names rather than counts. For each
    sub-folder of ``directory``:

    * a cluster folder (anything other than ``unique``) contributes the names
      of all of its image files, but only when none of them is already in
      ``training_names``; otherwise the whole folder is skipped;
    * the ``unique`` folder contributes each image name that is not in
      ``training_names``.

    A name is the file name truncated at the first ``"."``; image files are
    regular files ending in ``.png/.jpg/.jpeg/.gif/.bmp`` (case-insensitive).

    Args:
        directory: Folder holding the cluster sub-folders.
        training_names: Iterable of names already used for training.

    Returns:
        List of new names, with folders and files visited in sorted order.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    known_names = set(training_names)
    valid_new_data = []

    for item in sorted(os.listdir(directory)):
        item_path = os.path.join(directory, item)
        if not os.path.isdir(item_path):
            continue

        # Inspect the folder being visited. The earlier code built these paths
        # from `unique_folder`, which is unset (or points at a different
        # folder) while a cluster folder is being processed.
        image_names = [
            entry.partition(".")[0]
            for entry in sorted(os.listdir(item_path))
            if os.path.isfile(os.path.join(item_path, entry)) and entry.lower().endswith(image_suffixes)
        ]

        if item == "unique":
            valid_new_data.extend(name for name in image_names if name not in known_names)
        elif known_names.isdisjoint(image_names):
            # No overlap with the training set: the whole cluster is new data.
            valid_new_data.extend(image_names)

    return valid_new_data



In [48]:
# Source folder should contain all images you would like to cluster (and nothing else)
# Destination folder will be created to hold the clustering results.
orig_path = "active_data/nuScenes_active"
source_path = "/home/cvrr/Desktop/VLLM/active_data/zero-shot-clust/un_clustered"
dest_path = "/home/cvrr/Desktop/VLLM/active_data/zero-shot-clust/clustered"

# Tunables for this run, named once instead of being repeated as literals.
ZERO_SHOT_THRESHOLD = 20  # passed through, but the threshold check inside cluster_and_organize is commented out
BATCH_SIZE = 96
CLUSTER_CLIP_MODEL = "openai/clip-vit-large-patch14-336"

classes = [#'car',
                #'small car',
                #'big car',
                #'bicycle',
                'bike',
                'motorcycle',
                'construction vehicle',
                'bus',
                'traffic cone',
                'truck',
                'trailer',
                #'person',
                #'barricade',
                ]

# Step 1: zero-shot sort the raw images into one folder per class.
cluster_and_organize(orig_path, source_path, ZERO_SHOT_THRESHOLD, classes, batch_size=BATCH_SIZE)

cluster_length(source_path, classes)

# Distance threshold for the per-class hierarchical clustering.
# Notes from earlier runs:
#   I like .6 and .5
#   .8 gave 1
#   .7 gave 1
#   .2 gave 19841 unique
t_val = .5

# Step 2: copy each class folder to its own working folder and cluster it in place.
for p in classes:
    source_folder = os.path.join(source_path, p)
    destination_folder = os.path.join(dest_path, f"{p}_clustored")

    print("Copying Images for Clustering")
    copy_folder_contents(source_folder, destination_folder)

    # process_images moves files around, so it runs on the copy, not the source.
    experiment_image_directory = destination_folder
    process_images(experiment_image_directory, CLUSTER_CLIP_MODEL, t_val, BATCH_SIZE)
print('DONE!!!')

100%|██████████| 294/294 [04:56<00:00,  1.01s/it]


Images have been organized into clusters based on their top predictions.
bike length: 919
motorcycle length: 476
construction vehicle length: 1824
bus length: 4638
traffic cone length: 7967
truck length: 3618
trailer length: 8688
Copying Images for Clustering


Generating CLIP embeddings: 100%|██████████| 919/919 [00:24<00:00, 36.82it/s]


Building Annoy index...
Computing distance matrix...
Applying hierarchical clustering...
Copying Images for Clustering


Generating CLIP embeddings: 100%|██████████| 476/476 [00:12<00:00, 37.01it/s]


Building Annoy index...
Computing distance matrix...
Applying hierarchical clustering...
Copying Images for Clustering


Generating CLIP embeddings: 100%|██████████| 1824/1824 [00:49<00:00, 36.99it/s]


Building Annoy index...
Computing distance matrix...
Applying hierarchical clustering...
Copying Images for Clustering


Generating CLIP embeddings: 100%|██████████| 4638/4638 [02:05<00:00, 36.84it/s]


Building Annoy index...
Computing distance matrix...
Applying hierarchical clustering...
Copying Images for Clustering


Generating CLIP embeddings: 100%|██████████| 7967/7967 [03:38<00:00, 36.54it/s]


Building Annoy index...
Computing distance matrix...
Applying hierarchical clustering...
Copying Images for Clustering


Generating CLIP embeddings: 100%|██████████| 3618/3618 [01:37<00:00, 37.21it/s]


Building Annoy index...
Computing distance matrix...
Applying hierarchical clustering...
Copying Images for Clustering


Generating CLIP embeddings: 100%|██████████| 8688/8688 [03:56<00:00, 36.75it/s]


Building Annoy index...
Computing distance matrix...
Applying hierarchical clustering...
DONE!!!


### Diversity Sampling (First Round)

In [68]:
import os
import json

# Folder of nuScenes metadata JSON files. NOTE(review): presumably the value
# handed to file_opener; it is reassigned to the same string in a later cell.
train_path = 'data/nuscenes/v1.0-trainval'
# NOTE(review): test_path, LLM_path and al_path are not referenced in the
# visible cells -- confirm they are still needed.
test_path = 'data/nuscenes/v1.0-test'
LLM_path = 'active_data/nuScenes_active'
al_path = 'data/nuscenes'


def file_opener(train_path):
    """Load the scene/sample metadata in ``train_path`` and build lookup tables.

    Reads ``orig_scene.json``, ``orig_sample_data.json`` and
    ``orig_sample.json`` (in that order) from ``train_path``.

    Returns:
        A 3-tuple of dicts:
        ``filename_to_sample_token`` (base file name -> ``sample_token``),
        ``sample_token_to_scene_token`` (sample ``token`` -> ``scene_token``),
        ``scene_token_to_name`` (scene ``token`` -> ``name``).
    """
    def _read_json(file_name):
        with open(os.path.join(train_path, file_name), 'r') as handle:
            return json.load(handle)

    scene_records = _read_json('orig_scene.json')
    sample_data_records = _read_json('orig_sample_data.json')
    sample_records = _read_json('orig_sample.json')

    # Keyed by the last path component only, so lookups use bare file names.
    filename_to_sample_token = {}
    for record in sample_data_records:
        base_name = record['filename'].split('/')[-1]
        filename_to_sample_token[base_name] = record['sample_token']

    sample_token_to_scene_token = {}
    for record in sample_records:
        sample_token_to_scene_token[record['token']] = record['scene_token']

    scene_token_to_name = {}
    for record in scene_records:
        scene_token_to_name[record['token']] = record['name']

    return filename_to_sample_token, sample_token_to_scene_token, scene_token_to_name

def sample_extractor(sampled_image, filename_to_sample_token, sample_token_to_scene_token, scene_token_to_name):
    """Resolve an image file name to a scene name through three dict lookups.

    The chain is file name -> sample token -> scene token -> scene name. Each
    step uses ``dict.get``, so a miss anywhere in the chain yields ``None``
    (unless a mapping happens to contain a ``None`` key).
    """
    token_for_sample = filename_to_sample_token.get(sampled_image)
    token_for_scene = sample_token_to_scene_token.get(token_for_sample)
    return scene_token_to_name.get(token_for_scene)

In [69]:
def first_round_sampling_dynamic_selection(root_folder, chosen_folder, num_folders_to_select, train_path):
    """Pick up to ``num_folders_to_select`` images per class, one per cluster, each from a new scene.

    ``root_folder`` is expected to hold one sub-folder per class, each holding
    cluster sub-folders with images. For every class the cluster folders are
    visited in random order; from each cluster the first (randomly ordered)
    image whose scene has not been selected yet is copied into
    ``root_folder/chosen_folder``. A scene is used at most once across classes.

    The output folder itself and stray files in ``root_folder`` are not treated
    as classes, and images whose scene cannot be resolved from the metadata
    are skipped instead of being recorded under a ``None`` scene. Listings are
    sorted before shuffling so results are reproducible for a seeded ``random``.

    Args:
        root_folder: Folder holding the per-class folders.
        chosen_folder: Name of the output sub-folder created inside ``root_folder``.
        num_folders_to_select: Maximum number of images (clusters) per class.
        train_path: Metadata folder handed to ``file_opener``.

    Returns:
        ``(chosen_images, scene_names_set)`` where ``chosen_images`` is a list
        of ``(image_name, class_folder, scene_name)`` tuples.
    """
    chosen_dir = os.path.join(root_folder, chosen_folder)
    os.makedirs(chosen_dir, exist_ok=True)
    chosen_images = []
    scene_names_set = set()

    filename_to_sample_token, sample_token_to_scene_token, scene_token_to_name = file_opener(train_path)

    for class_folder in sorted(os.listdir(root_folder)):
        class_path = os.path.join(root_folder, class_folder)
        # The output folder lives inside root_folder; it is not a class.
        if not os.path.isdir(class_path) or os.path.abspath(class_path) == os.path.abspath(chosen_dir):
            continue

        cluster_folders = sorted(
            name for name in os.listdir(class_path) if os.path.isdir(os.path.join(class_path, name))
        )
        random.shuffle(cluster_folders)
        selected_scenes_count = 0

        for selected_cluster_folder in cluster_folders:
            if selected_scenes_count >= num_folders_to_select:
                break

            selected_cluster_path = os.path.join(class_path, selected_cluster_folder)
            images = sorted(
                name for name in os.listdir(selected_cluster_path)
                if os.path.isfile(os.path.join(selected_cluster_path, name))
            )
            random.shuffle(images)

            for sampled_image in images:
                scene_name = sample_extractor(sampled_image, filename_to_sample_token, sample_token_to_scene_token, scene_token_to_name)

                # Unknown scene (image missing from the metadata) or a scene
                # that is already represented: try the next image.
                if scene_name is None or scene_name in scene_names_set:
                    continue

                image_path = os.path.join(selected_cluster_path, sampled_image)
                shutil.copy(image_path, os.path.join(chosen_dir, sampled_image))
                chosen_images.append((sampled_image, class_folder, scene_name))
                scene_names_set.add(scene_name)
                selected_scenes_count += 1
                break  # One image per cluster folder.

        if selected_scenes_count < num_folders_to_select:
            print(f"Warning: Only {selected_scenes_count} unique scenes were found for class '{class_folder}', less than the desired {num_folders_to_select}.")

    print("Total unique scenes selected:", len(scene_names_set))
    return chosen_images, scene_names_set

### Diversity Sampling (Subsequent Rounds)

In [111]:
def count_images_in_folder(folder_path):
    """
    Counts the regular files located directly inside a folder.

    Every regular file is counted regardless of its extension; sub-folders
    are not counted and not descended into.

    :param folder_path: Path to the folder.
    :return: The number of regular files in the folder.
    """
    file_total = 0
    for entry_name in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, entry_name)):
            file_total += 1
    return file_total

def process_folder(folder_path, class_folder, chosen_images, scene_names_set, num_folders_to_select, filename_to_sample_token, sample_token_to_scene_token, scene_token_to_name, root_folder, chosen_folder):
    """Select images from one folder whose scenes have not been selected yet.

    Images are visited in random order (sorted first, so a seeded ``random``
    gives reproducible picks). An image is selected when its scene resolves
    from the metadata and is not in ``scene_names_set``; at most
    ``num_folders_to_select`` images are selected, and none when that quota is
    zero or negative. Images whose scene cannot be resolved are skipped rather
    than recorded under a ``None`` scene.

    ``chosen_images`` and ``scene_names_set`` are updated in place. No files
    are copied; ``root_folder`` and ``chosen_folder`` are accepted for
    interface compatibility only.

    Args:
        folder_path: Folder with candidate images.
        class_folder: Class name, used in the warning message.
        chosen_images: List receiving ``(image, class, scene_name)`` tuples,
            where class is the name of ``folder_path``'s parent folder.
        scene_names_set: Set of scene names already selected.
        num_folders_to_select: Maximum number of new scenes to add.

    Returns:
        Number of new scenes added from this folder.
    """
    images = sorted(name for name in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, name)))
    random.shuffle(images)
    scenes_added = 0  # Track the number of new scenes added

    for sampled_image in images:
        if scenes_added >= num_folders_to_select:
            break  # Stop once the desired number of unique scenes is reached

        scene_name = sample_extractor(sampled_image, filename_to_sample_token, sample_token_to_scene_token, scene_token_to_name)
        # Skip images missing from the metadata and scenes already covered.
        if scene_name is None or scene_name in scene_names_set:
            continue

        scene_names_set.add(scene_name)
        chosen_images.append((sampled_image, os.path.basename(os.path.dirname(folder_path)), scene_name))
        scenes_added += 1

    if scenes_added < num_folders_to_select:
        print(f"Warning: Only {scenes_added} unique scenes were found for class '{class_folder}', less than the desired {num_folders_to_select}.")

    return scenes_added

#The main function with added logic to prioritize the "unique" folder and sort remaining folders by image count
def vislling_samples(root_folder, chosen_folder, num_folders_to_select, train_path, scene_names_set_prior):
    """Add up to ``num_folders_to_select`` new scenes per class on top of earlier rounds.

    For each class folder in ``root_folder`` the ``unique`` cluster folder is
    processed first, then the remaining cluster folders from the smallest to
    the largest (by file count), until the class quota is reached. Each call
    to ``process_folder`` receives only the quota still open for the class, so
    a class cannot exceed ``num_folders_to_select``.

    The output folder ``root_folder/chosen_folder`` and stray files in
    ``root_folder`` are not treated as classes.

    Args:
        root_folder: Folder holding the per-class folders.
        chosen_folder: Name of the output sub-folder (created if missing).
        num_folders_to_select: Maximum number of new scenes per class.
        train_path: Metadata folder handed to ``file_opener``.
        scene_names_set_prior: Scene names selected in earlier rounds; this
            collection is copied, not modified.

    Returns:
        ``(chosen_images, scene_names_set)``: the tuples selected in this round
        and the union of prior and newly selected scene names.
    """
    print("vislling being processes")
    print("amount of scenes present at start: ", len(scene_names_set_prior))

    chosen_dir = os.path.join(root_folder, chosen_folder)
    os.makedirs(chosen_dir, exist_ok=True)
    chosen_images = []
    scene_names_set = set(scene_names_set_prior)
    print("Len Scene names", len(scene_names_set))

    filename_to_sample_token, sample_token_to_scene_token, scene_token_to_name = file_opener(train_path)

    for class_folder in sorted(os.listdir(root_folder)):
        class_path = os.path.join(root_folder, class_folder)
        # The output folder lives inside root_folder; it is not a class.
        if not os.path.isdir(class_path) or os.path.abspath(class_path) == os.path.abspath(chosen_dir):
            continue

        new_scenes_added = 0
        all_cluster_folders = sorted(
            name for name in os.listdir(class_path) if os.path.isdir(os.path.join(class_path, name))
        )
        # Shuffling randomises the order of equally sized folders, because the
        # size sort below is stable.
        random.shuffle(all_cluster_folders)

        # Process the "unique" folder first if it exists
        if "unique" in all_cluster_folders:
            all_cluster_folders.remove("unique")
            new_scenes_added += process_folder(os.path.join(class_path, "unique"), class_folder, chosen_images, scene_names_set, num_folders_to_select, filename_to_sample_token, sample_token_to_scene_token, scene_token_to_name, root_folder, chosen_folder)

        # Sort remaining folders by image count
        sorted_folders = sorted(all_cluster_folders, key=lambda folder: count_images_in_folder(os.path.join(class_path, folder)))

        for folder in sorted_folders:
            remaining = num_folders_to_select - new_scenes_added
            if remaining <= 0:
                break  # Exit if we've reached the desired scene count
            print(f"processing scenes from other folders, for class {class_folder}")
            # Hand over only the open quota; passing the full quota to every
            # folder would let a class overshoot its target.
            new_scenes_added += process_folder(os.path.join(class_path, folder), class_folder, chosen_images, scene_names_set, remaining, filename_to_sample_token, sample_token_to_scene_token, scene_token_to_name, root_folder, chosen_folder)

    print("Overall total unique scenes selected:", len(scene_names_set))
    return chosen_images, scene_names_set



#Note: file_opener, sample_extractor, and process_folder are defined in the cells above;
#run those cells before calling vislling_samples.

In [112]:
# Define paths
root_folder = "/home/cvrr/Desktop/VLLM/active_data/zero-shot-clust/clustered"  # Root folder containing class folders
chosen_folder = 'chosen'  # Folder to save the chosen images
num_folders_to_select = 14  # Number of cluster folders to select
train_path = 'data/nuscenes/v1.0-trainval'

# The split files are written to ./splits. Create the folder up front so a
# missing directory cannot make open() fail after the sampling has already run.
os.makedirs('splits', exist_ok=True)

# Round 1 (10% split): one image per cluster, each from a previously unseen scene.
chosen_images, scene_names_set = first_round_sampling_dynamic_selection(root_folder, chosen_folder, num_folders_to_select, train_path)

with open('splits/10p_diversity_split.json', 'w') as f:
    json.dump(list(scene_names_set), f)

# Later rounds: every split extends the scene set of the previous one.
scene_names_set_prior = scene_names_set
num_folders_to_selects = [15, 14, 14, 15]  # Number of cluster folders to select
pers = [20, 30, 40, 50]

for per, num_folders_to_select in zip(pers, num_folders_to_selects):
    chosen_images, scene_names_set = vislling_samples(root_folder, chosen_folder, num_folders_to_select, train_path, scene_names_set_prior)

    with open(f'splits/{per}p_diversity_split.json', 'w') as f:
        json.dump(list(scene_names_set), f)
    scene_names_set_prior = scene_names_set

Total unique scenes selected: 98
vislling being processes
amount of scenes present at start:  98
Len Scene names 98
Overall total unique scenes selected: 203
vislling being processes
amount of scenes present at start:  203
Len Scene names 203
Overall total unique scenes selected: 301
vislling being processes
amount of scenes present at start:  301
Len Scene names 301
Overall total unique scenes selected: 399
vislling being processes
amount of scenes present at start:  399
Len Scene names 399
processing scenes from other folders, for class motorcycle_clustored
processing scenes from other folders, for class motorcycle_clustored
processing scenes from other folders, for class motorcycle_clustored
processing scenes from other folders, for class motorcycle_clustored
processing scenes from other folders, for class motorcycle_clustored
processing scenes from other folders, for class motorcycle_clustored
processing scenes from other folders, for class motorcycle_clustored
processing scenes fr

### Diversity Sampling (Other Rounds)

In [55]:
# Class folders to report on. Excluded candidates: 'car', 'small car',
# 'big car', 'barricade'.
classes = [
    'bike',
    'motorcycle',
    'truck',
    'trailer',
    'person',
    'traffic cone',
    'bus',
    'construction vehicle',
]

# Each clustered folder carries the class name plus the "_clustored" suffix.
clust_classes = [f"{class_name}_clustored" for class_name in classes]

clust_path = "/home/cvrr/Desktop/VLLM/active_data/zero-shot-clust/clustered"
path = "/home/cvrr/Desktop/VLLM/active_data/zero-shot-clust/un_clustered"

# Calls kept for reference, currently disabled:
#cluster_and_organize("active_data/nuScenes_active", "active_data/zero-shot-clust", 20 ,classes)
#cluster_length(clust_path, clust_classes)

cluster_length(path, classes)

bike length: 758
motorcycle length: 298
truck length: 3255
trailer length: 7722
person length: 3117
traffic cone length: 7036
bus length: 4165
construction vehicle length: 1779
