This notebook is used to reorder the task sequence and group similar/dissimilar classes within task

In [None]:
from dreamsim import dreamsim
from PIL import Image
import os
import torch

cache_dir = os.path.expanduser("~/.cache")
model, preprocess = dreamsim(pretrained=True, cache_dir=cache_dir)

Using cached /homes/55/enbo/.cache


Using cache found in /homes/55/enbo/.cache/facebookresearch_dino_main
  "fan_in_fan_out is set to True but the target module is not a Conv1D. "


In [None]:
def compare_two_datasets(folder1, folder2):
    """
    this function can be used to compare the perceptual distance between two
    classes of images saving at different folders
    """
    all_files1 = os.listdir(folder1)
    all_files2 = os.listdir(folder2)

    all_non_txt_files1 = [file for file in all_files1 if (file.endswith('.jpg') or file.endswith('.png'))]
    all_non_txt_files2 = [file for file in all_files2 if (file.endswith('.jpg') or file.endswith('.png'))]

    if set(all_non_txt_files1) != set(all_non_txt_files2):
        print("items in the two folders do not match")
        return
    else:
        print("items in the two folders match, going to measure...")

    total_distance = 0
    for i, files in enumerate(all_non_txt_files1):
        img1 = preprocess(Image.open(os.path.join(folder1, files)))
        img2 = preprocess(Image.open(os.path.join(folder2, files)))
        if i%20 == 0:
            print(i)

        if torch.cuda.is_available():
            img1 = img1.cuda()
            img2 = img2.cuda()

        distance = model(img1, img2)
        total_distance += distance.item()

    return total_distance


In [None]:
from torchvision import datasets, transforms
from torchvision.transforms.functional import pil_to_tensor
from torchvision.utils import save_image

In [None]:
test_dataset = datasets.MNIST('data', train=False, download=True)
# from the mnist test_dataset to extract the class names
name_list = test_dataset.classes

In [None]:
name_list

['0 - zero',
 '1 - one',
 '2 - two',
 '3 - three',
 '4 - four',
 '5 - five',
 '6 - six',
 '7 - seven',
 '8 - eight',
 '9 - nine']

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
import os

transform =transforms.Compose([
                        #  transforms.RandomHorizontalFlip(),
                         transforms.ToTensor()
])

In [None]:
import os
from torchvision.transforms.functional import to_pil_image, to_tensor
from torchvision.datasets import CIFAR100
from torchvision.utils import save_image
import torchvision.transforms as transforms

def save_mnist(dataset, num_images_per_class, save_dir):
    """
    this function is used to save samples of images from the dataset for comparison

    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    saved_counts = {label: 0 for label in range(10)}  # Initialize saved image count for each class

    for image, label in dataset:
        if saved_counts[label] >= num_images_per_class:
            continue

        # Convert tensor image to PIL image and back to tensor to normalize
        img = to_pil_image(image).convert("RGB")
        image_tensor = to_tensor(img)

        class_name = dataset.classes[label].replace('/', '_')
        class_dir = os.path.join(save_dir, class_name)

        if not os.path.exists(class_dir):
            os.makedirs(class_dir)

        filename = f'image{saved_counts[label]}.png'
        image_path = os.path.join(class_dir, filename)

        save_image(image_tensor, image_path)
        saved_counts[label] += 1

        class_file_path = os.path.join(save_dir, f"{class_name}.txt")
        with open(class_file_path, "a") as file:
            file.write(f"{image_path} {label}\n")

        if all(count >= num_images_per_class for count in saved_counts.values()):
            break

    print(f"Saved {num_images_per_class} images per class from the MNiST training dataset.")


In [None]:
trainset = torchvision.datasets.MNIST(root='data', train=True,
                                         download=True, transform=transform)

# save 20 images from the trainset for each class for comparison
save_mnist(trainset, 20, 'mnist_10_0407')


Saved 20 images per class from the MNiST training dataset.


In [None]:

import os
from PIL import Image
import torchvision.transforms as transforms
import torch
from tqdm import tqdm

def compare_class_pairs(base_dir, num_images=10):
    classes = os.listdir(base_dir)
    classes = [file for file in classes if not file.endswith('.txt')]
    classes.sort()  # Ensure consistent order

    distances = {}
    # Use tqdm to track progress over classes
    for i, class1 in tqdm(enumerate(classes[:-1]), total=len(classes[:-1]), desc="Comparing classes"):
        for class2 in classes[i+1:]:
            class1_dir = os.path.join(base_dir, class1)
            class2_dir = os.path.join(base_dir, class2)
            total_distance = 0
            for idx in range(num_images):
                img1_path = os.path.join(class1_dir, f"image{idx}.png")
                img2_path = os.path.join(class2_dir, f"image{idx}.png")

                img1 = preprocess(Image.open(img1_path).convert("RGB"))
                img2 = preprocess(Image.open(img2_path).convert("RGB"))

                if torch.cuda.is_available():
                    img1 = img1.cuda()
                    img2 = img2.cuda()

                distance = model(img1, img2)
                total_distance += distance

            average_distance = total_distance / num_images
            distances[f"{class1} vs {class2}"] = average_distance

    return distances



In [None]:
re = compare_class_pairs('mnist_10_0407', num_images=20)

Comparing classes: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:43<00:00,  4.78s/it]


In [None]:
re

{'0 - zero vs 1 - one': tensor([0.3298], device='cuda:0'),
 '0 - zero vs 2 - two': tensor([0.2269], device='cuda:0'),
 '0 - zero vs 3 - three': tensor([0.2288], device='cuda:0'),
 '0 - zero vs 4 - four': tensor([0.3223], device='cuda:0'),
 '0 - zero vs 5 - five': tensor([0.2327], device='cuda:0'),
 '0 - zero vs 6 - six': tensor([0.2089], device='cuda:0'),
 '0 - zero vs 7 - seven': tensor([0.2764], device='cuda:0'),
 '0 - zero vs 8 - eight': tensor([0.2260], device='cuda:0'),
 '0 - zero vs 9 - nine': tensor([0.2293], device='cuda:0'),
 '1 - one vs 2 - two': tensor([0.2964], device='cuda:0'),
 '1 - one vs 3 - three': tensor([0.3195], device='cuda:0'),
 '1 - one vs 4 - four': tensor([0.2813], device='cuda:0'),
 '1 - one vs 5 - five': tensor([0.2837], device='cuda:0'),
 '1 - one vs 6 - six': tensor([0.2917], device='cuda:0'),
 '1 - one vs 7 - seven': tensor([0.2585], device='cuda:0'),
 '1 - one vs 8 - eight': tensor([0.3159], device='cuda:0'),
 '1 - one vs 9 - nine': tensor([0.2939], devic

In [None]:

# total_sum = torch.tensor([0.0], device='cuda:0')
# for value in re.values():
#     total_sum += value
# average = total_sum / len(re)

# print("Average of the tensors:", average.item())


Average of the tensors: 0.24127596616744995


In [None]:
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import squareform
from sklearn.manifold import MDS

In [None]:
num_classes = len(name_list)
distance_matrix = np.zeros((num_classes, num_classes))


for i, class1 in enumerate(name_list):
    for j, class2 in enumerate(name_list):
        if i == j:
            continue
        key = f"{class1} vs {class2}" if f"{class1} vs {class2}" in re else f"{class2} vs {class1}"
        distance_matrix[i, j] = re[key]

# # Since KMeans doesn't work directly with a distance matrix, we can use MDS to convert our distances into points
# mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
# points = mds.fit_transform(distance_matrix)

# # Cluster the points into 5 clusters
# kmeans = KMeans(n_clusters=5, random_state=42).fit(points)
# labels = kmeans.labels_

# # Organize classes into groups based on cluster assignment
# groups = {i: [] for i in range(5)}
# for class_index, group_index in enumerate(labels):
#     groups[group_index].append(name_list[class_index])

# # Display the groups
# for group, group_classes in groups.items():
#     print(f"Group {group + 1}: {', '.join(group_classes)}")


Group 1: 2 - two, 3 - three
Group 2: 5 - five, 8 - eight, 9 - nine
Group 3: 1 - one
Group 4: 0 - zero, 6 - six
Group 5: 4 - four, 7 - seven


In [None]:
n1, n2 = 1, 6
num_task = 5
num_classes = 10

In [None]:
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

# Convert the distance matrix to condensed form since linkage function expects this format
condensed_distance_matrix = squareform(distance_matrix)

# Perform hierarchical clustering
Z = linkage(condensed_distance_matrix, 'average')

initial_clusters = fcluster(Z, t=num_task, criterion='maxclust')
clusters = {i: [] for i in range(n1, n2)}  # Adjust based on actual initial cluster ids

# Assign classes to initial clusters
for class_index, cluster_id in enumerate(initial_clusters):
    clusters[cluster_id].append(name_list[class_index])

# Function to adjust clusters to have exactly five members
def adjust_clusters(clusters):
    final_groups = {}
    group_id = 0
    temp_group = []

    for cluster_id, members in clusters.items():
        for member in members:
            temp_group.append(member)
            if len(temp_group) == int(num_classes/num_task):  # Once we have 5 members, save the group and reset
                final_groups[group_id] = temp_group
                group_id += 1
                temp_group = []

    return final_groups

# Adjust the clusters to ensure each has exactly five classes
final_groups = adjust_clusters(clusters)

# Display the groups
for group_id, group_classes in final_groups.items():
    print(f"Group {group_id + 1}: {', '.join(group_classes)}")


Group 1: 2 - two, 3 - three
Group 2: 5 - five, 6 - six
Group 3: 8 - eight, 9 - nine
Group 4: 0 - zero, 4 - four
Group 5: 7 - seven, 1 - one


In [None]:
final_groups

{0: ['2 - two', '3 - three'],
 1: ['5 - five', '6 - six'],
 2: ['8 - eight', '9 - nine'],
 3: ['0 - zero', '4 - four'],
 4: ['7 - seven', '1 - one']}

In [None]:

task_similarity = [[0 for _ in range(num_task)] for _ in range(num_task)]

for i in range(num_task):
    for j in range(num_task):
        if i == j:
            continue  # Skip comparing the task with itself
        # Calculate average distance between classes in task i and task j
        total_distance = 0
        count = 0
        for class1 in final_groups[i]:
            for class2 in final_groups[j]:
                key = f"{class1} vs {class2}" if f"{class1} vs {class2}" in re else f"{class2} vs {class1}"
                total_distance += re[key]
                count += 1
        average_distance = total_distance / count
        task_similarity[i][j] = average_distance


In [None]:
task_similarity

[[0,
  tensor([0.1939], device='cuda:0'),
  tensor([0.2292], device='cuda:0'),
  tensor([0.2417], device='cuda:0'),
  tensor([0.2723], device='cuda:0')],
 [tensor([0.1939], device='cuda:0'),
  0,
  tensor([0.1984], device='cuda:0'),
  tensor([0.2361], device='cuda:0'),
  tensor([0.2611], device='cuda:0')],
 [tensor([0.2292], device='cuda:0'),
  tensor([0.1984], device='cuda:0'),
  0,
  tensor([0.2437], device='cuda:0'),
  tensor([0.2730], device='cuda:0')],
 [tensor([0.2417], device='cuda:0'),
  tensor([0.2361], device='cuda:0'),
  tensor([0.2437], device='cuda:0'),
  0,
  tensor([0.2822], device='cuda:0')],
 [tensor([0.2723], device='cuda:0'),
  tensor([0.2611], device='cuda:0'),
  tensor([0.2730], device='cuda:0'),
  tensor([0.2822], device='cuda:0'),
  0]]

In [None]:
import numpy as np

def greedy_nearest_neighbors(task_similarity):
    """
    A greedy approach to sequencing tasks based on minimizing the
    distribution change between consecutive tasks.
    """
    num_tasks = len(task_similarity)
    visited = [False] * num_tasks
    current_task = 0
    task_order = [current_task]
    visited[current_task] = True

    while len(task_order) < num_tasks:
        # Find the nearest unvisited task to the current task
        next_task = None
        min_change = float('inf')
        for i in range(num_tasks):
            if not visited[i] and task_similarity[current_task][i] < min_change:
                next_task = i
                min_change = task_similarity[current_task][i]
        # Update the current task, mark it as visited, and add it to the task order
        visited[next_task] = True
        task_order.append(next_task)
        current_task = next_task

    return task_order


In [None]:

task_order = greedy_nearest_neighbors(task_similarity)

print("Suggested task order to minimize distribution changes:", task_order)


Suggested task order to minimize distribution changes: [0, 1, 2, 3, 4]


## dissimilarity

In [None]:
import networkx as nx
# Sample data: distances between class pairs as provided
distances = {
    '0 - zero vs 1 - one': 0.3298,
    '0 - zero vs 2 - two': 0.2269,
    '0 - zero vs 3 - three': 0.2288,
    '0 - zero vs 4 - four': 0.3223,
    '0 - zero vs 5 - five': 0.2327,
    '0 - zero vs 6 - six': 0.2089,
    '0 - zero vs 7 - seven': 0.2764,
    '0 - zero vs 8 - eight': 0.2260,
    '0 - zero vs 9 - nine': 0.2293,
    '1 - one vs 2 - two': 0.2964,
    '1 - one vs 3 - three': 0.3195,
    '1 - one vs 4 - four': 0.2813,
    '1 - one vs 5 - five': 0.2837,
    '1 - one vs 6 - six': 0.2917,
    '1 - one vs 7 - seven': 0.2585,
    '1 - one vs 8 - eight': 0.3159,
    '1 - one vs 9 - nine': 0.2939,
    '2 - two vs 3 - three': 0.1842,
    '2 - two vs 4 - four': 0.2436,
    '2 - two vs 5 - five': 0.1827,
    '2 - two vs 6 - six': 0.2079,
    '2 - two vs 7 - seven': 0.2281,
    '2 - two vs 8 - eight': 0.2365,
    '2 - two vs 9 - nine': 0.2189,
    '3 - three vs 4 - four': 0.2673,
    '3 - three vs 5 - five': 0.1720,
    '3 - three vs 6 - six': 0.2132,
    '3 - three vs 7 - seven': 0.2452,
    '3 - three vs 8 - eight': 0.2367,
    '3 - three vs 9 - nine': 0.2247,
    '4 - four vs 5 - five': 0.2431,
    '4 - four vs 6 - six': 0.2599,
    '4 - four vs 7 - seven': 0.2415,
    '4 - four vs 8 - eight': 0.2609,
    '4 - four vs 9 - nine': 0.2586,
    '5 - five vs 6 - six': 0.1834,
    '5 - five vs 7 - seven': 0.2235,
    '5 - five vs 8 - eight': 0.2105,
    '5 - five vs 9 - nine': 0.2018,
    '6 - six vs 7 - seven': 0.2458,
    '6 - six vs 8 - eight': 0.2027,
    '6 - six vs 9 - nine': 0.1786,
    '7 - seven vs 8 - eight': 0.2602,
    '7 - seven vs 9 - nine': 0.2221,
    '8 - eight vs 9 - nine': 0.1819
}

# Create a complete graph
G = nx.Graph()

# Add edges between classes with their dissimilarities as weights
for pair, distance in distances.items():
    class1, class2 = pair.split(' vs ')

    # Add edge between the classes with distance as weight
    G.add_edge(class1, class2, weight=distance)

# Find the maximum weight matching in the graph
max_matching = nx.max_weight_matching(G, maxcardinality=True, weight='weight')

# Print out the matched pairs (i.e., the tasks)
print("Tasks with maximized dissimilarities:")
for task in max_matching:
    print(f"Task: {task}")


Tasks with maximized dissimilarities:
Task: ('8 - eight', '2 - two')
Task: ('4 - four', '0 - zero')
Task: ('3 - three', '1 - one')
Task: ('9 - nine', '5 - five')
Task: ('6 - six', '7 - seven')


In [None]:
final_groups = {0: ['8 - eight', '2 - two'],
 1: ['4 - four', '0 - zero'],
 2: ['3 - three', '1 - one'],
 3: ['9 - nine', '5 - five'],
 4: ['6 - six', '7 - seven']}

In [None]:

num_task = 5
task_similarity = [[0 for _ in range(num_task)] for _ in range(num_task)]
re = distances
for i in range(num_task):
    for j in range(num_task):
        if i == j:
            continue  # Skip comparing the task with itself
        # Calculate average distance between classes in task i and task j
        total_distance = 0
        count = 0
        for class1 in final_groups[i]:
            for class2 in final_groups[j]:
                key = f"{class1} vs {class2}" if f"{class1} vs {class2}" in re else f"{class2} vs {class1}"
                total_distance += re[key]
                count += 1
        average_distance = total_distance / count
        task_similarity[i][j] = average_distance


In [None]:
task_similarity

[[0, 0.23935, 0.2583, 0.19849999999999998, 0.22472499999999998],
 [0.23935, 0, 0.2768, 0.24092500000000003, 0.246675],
 [0.2583, 0.2768, 0, 0.24357499999999999, 0.25215],
 [0.1985, 0.240925, 0.24357499999999999, 0, 0.20190000000000002],
 [0.22472499999999998, 0.24667499999999998, 0.25215, 0.2019, 0]]

In [None]:
# Example usage
task_order = greedy_nearest_neighbors(task_similarity)

print("Suggested task order to minimize distribution changes:", task_order)


Suggested task order to minimize distribution changes: [0, 3, 4, 1, 2]
