In [None]:
import os
from os import listdir, walk
from os.path import isfile, join
import cv2
import numpy as np

# Remember the working directory at the time this cell runs; later cells
# chdir back to it and build the dataset folder path from it.
work_Directory = os.getcwd()


# Obtaining the names of the images in the dataset folder

In [None]:
# Make sure we start from the notebook's base directory.
os.chdir(work_Directory)
print(os.getcwd())

# Build the image folder path from the base directory and walk it directly.
# Not chdir-ing into the folder keeps the kernel's working directory stable,
# so the relative 'dataset/...' paths used by later cells still resolve.
image_dir = os.path.join(work_Directory, 'dataset', 'dataverse_files-2', 'ZT76_39_A')
if not os.path.isdir(image_dir):
    # Report the problem instead of silently listing some other directory.
    print('Encounter an error: ', f'dataset folder not found: {image_dir}')

# Only the top level of the folder is needed, so take the first entry that
# os.walk yields. The default keeps dirpath/dirnames/filenames defined (and
# empty) when the folder is missing or unreadable, so the next cell still runs.
dirpath, dirnames, filenames = next(walk(image_dir), (image_dir, [], []))

f = list(filenames)
print(f'The list of files in the dataset folder is {f}')
print(f'The number of images in the folder "ZT76_39_A" is {len(f)}')



In [None]:
# Show the three values left over from the directory walk in the previous
# cell: the visited path, its sub-directories and its files.
for walk_part in (dirpath, dirnames, filenames):
    print(walk_part)

# Another way to read the images and store them in a list together with their names

In [None]:
import os, cv2

def load_images_from_folder(folder_path, extensions=('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
    """
    Loads images from a given folder, returning a list of tuples (filename, image).

    Args:
        folder_path: Directory to scan. Only its top level is read.
        extensions: File-name suffixes (case-insensitive) that mark a file as
            an image. Defaults to the formats this notebook already handled;
            pass e.g. ('.tif',) to pick up other formats.

    Returns:
        A list of (filename, image) tuples sorted by filename, where image is
        whatever cv2.imread returned for that file. Files that cv2.imread
        cannot decode are reported with a warning and left out.

    Raises:
        FileNotFoundError / NotADirectoryError: propagated from os.listdir
        when folder_path does not exist or is not a directory.
    """
    # Accept a single suffix given as a plain string.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    images = []

    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(suffixes):
            continue

        file_path = os.path.join(folder_path, filename)

        # A directory may carry an image-like name; it is not an image file.
        if not os.path.isfile(file_path):
            continue

        # cv2.imread signals failure by returning None rather than raising.
        img = cv2.imread(file_path)
        if img is None:
            print(f"Warning: Could not read {file_path}")
            continue

        images.append((filename, img))

    return images

# ----------------- Load the dataset -----------------
# Build the folder path from the notebook's base directory so this cell works
# regardless of the kernel's current working directory (an earlier cell
# changes it with os.chdir).
image_folder_path = os.path.join(work_Directory, 'dataset', 'dataverse_files-2', 'ZT76_39_A')
images_with_names = load_images_from_folder(image_folder_path)

# Each element in images_with_names is a tuple: (filename, image)
for image_name, image in images_with_names:
    print(f"Filename: {image_name}, Image shape: {image.shape}")

# Reading the converted dataset to inspect the distribution of the labels

In [None]:
import torch

# Build the path from the notebook's base directory so the load does not
# depend on the kernel's current working directory (an earlier cell changes
# it with os.chdir).
dataset_path = os.path.join(
    work_Directory, 'dataset', 'ConvertedDataset', '20250326_GraphDatasetWithJabels.pt'
)

# NOTE(review): torch.load relies on pickle-based deserialization; only load
# files from a trusted source.
dataset  = torch.load(dataset_path)

# Preview the labels of at most the first 10 samples. Pairing with range(10)
# stops early on short datasets instead of raising IndexError.
for _, sample in zip(range(10), dataset):
    print(sample['y'])

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np


# Stack every sample's label vector into one array, one row per sample.
# Each 'y' is flattened first so that a label stored as e.g. (1, n) still
# becomes a single row.
# NOTE(review): assumes every 'y' is array-like with the same number of
# entries — confirm against the dataset.
labels = np.array([np.asarray(sample['y']).ravel() for sample in dataset])
print(labels)

# 1. Convert each multi-hot vector to a tuple (immutable) so it can be hashed easily.
#    .tolist() yields native Python numbers, so str(tuple) reads "(1.0, 0.0, ...)"
#    rather than "(np.float32(1.0), ...)" on NumPy 2.
tuple_labels = [tuple(row.tolist()) for row in labels]
# Preview only; printing every sample floods the cell output.
print(tuple_labels[:10])

# 2. Count occurrences of each unique label combination
counts = Counter(tuple_labels)

# 3. Prepare data for plotting
# The keys (tuples) are converted to strings for a nicer x-axis
label_combinations = [str(combo) for combo in counts]
occurrences = list(counts.values())

# 4. Plot the distribution
plt.figure(figsize=(8, 4))
bars = plt.bar(label_combinations, occurrences, color='skyblue')
plt.xticks(rotation=45, ha='right')
plt.xlabel('Label Combination(white, green, blue, yellow, red)')
plt.ylabel('Count')
plt.title('Distribution of Multi-Label Combinations')

# 5. Add the count values on top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        height,
        str(int(height)),  # counts are whole numbers; matches the per-label plot
        ha='center',
        va='bottom',
        fontsize=10
    )

# Lay out after the annotations exist so they are taken into account.
plt.tight_layout()
plt.show()


In [None]:
# Display the first sample of the loaded dataset as the cell output to
# inspect its structure.
dataset[0]

## Compare how often each individual label occurs in the dataset.
## per-label frequency distribution


In [None]:
# Names of the label columns, in column order
label_names = ["white", "green", "blue", "yellow", "red"]

# Column-wise sum over the label array: how many samples carry each label
arr = np.array(labels)
per_label_counts = arr.sum(axis=0)

# Bar chart of the per-label frequencies
plt.figure(figsize=(6,4))
plt.bar(label_names, per_label_counts, color='skyblue')
plt.title("Per-Label Frequency in Multi-Label Dataset")
plt.ylabel("Frequency")
plt.xlabel("Labels")

# Write each count just above its bar
for i, count in enumerate(per_label_counts):
    count_text = str(int(count))
    plt.text(i, count+0.1, count_text, ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Also emit the raw counts as text
for label, count in zip(label_names, per_label_counts):
    print(f"{label}: {count}")