In [None]:
import numpy as np
import skimage
import skimage.data
import skimage.io
import os
import random
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
from matplotlib.patches import Patch

# 1. Read Dataset

In [None]:
# Dataset root and the three sub-folders this notebook reads from.
base = "dataset"
paths = {
    "label": os.path.join(base, "label_images_semantic"),
    "orig": os.path.join(base, "original_images"),
    "rgb":  os.path.join(base, "RGB_color_image_masks")
}

# os.listdir() returns entries in arbitrary order, so sort to get a
# reproducible file order.
orig_files = sorted(f for f in os.listdir(paths["orig"]) if f.endswith(".jpg"))

# Class table; the columns "name", "r", "g", "b" are read below.
# skipinitialspace=True makes a header written as "name, r, g, b" parse to the
# same column names as "name,r,g,b".
# NOTE(review): added defensively -- confirm which layout the local CSV uses.
df = pd.read_csv(os.path.join(base, "class_dict_seg.csv"), skipinitialspace=True)

# (class name, (r, g, b)) pairs with plain Python ints, in CSV row order.
color_items = [
    (name, (int(r), int(g), int(b)))
    for name, r, g, b in zip(df["name"], df["r"], df["g"], df["b"])
]


In [None]:
# Number of original images in the initial directory listing; later cells use
# this value to compute the split boundaries and in the summary print.
n_total = len(orig_files)

### 1.1. Index checks 

In [None]:
# Report how many .jpg/.png files (extension matched case-insensitively)
# each dataset folder contains.
image_exts = (".jpg", ".png")
for name, path in paths.items():
    count = sum(1 for entry in os.listdir(path) if entry.lower().endswith(image_exts))
    print(f"{name} images: {count}")

In [None]:
# Parse the numeric stem of every file name (int() raises if a stem is not a number).
indices = [int(stem) for stem, _ext in map(os.path.splitext, orig_files)]

lowest, highest = min(indices), max(indices)
print("Lowest index:", lowest)
print("Highest index:", highest)

In [None]:
# Indices inside [min, max] that have no corresponding file.
actual = set(indices)
expected = set(range(min(indices), max(indices) + 1))
missing = sorted(expected.difference(actual))

if not missing:
    print("No missing indices, numbering is continuous.")
else:
    print("Missing indices:", missing)

In [None]:
# One-off maintenance cell. The message printed below is Italian for:
# "To be run only once, to fix the image names, since some are missing."
#
# All renaming code below is commented out, so running this cell only prints
# the message. If it is re-enabled, it works in two passes:
#   1. every original .jpg and the same-stem .png in the label and RGB-mask
#      folders are renamed to "t_NNN.<ext>", where NNN is the file's position
#      in orig_files, zero-padded to 3 digits;
#   2. the "t_" marker is removed from the names in each of the three folders.
# NOTE(review): str.replace("t_", "") removes every "t_" occurrence in a name,
# not only the leading prefix.
print(f"Da eseguire solo una volta per mettere a posto il nome delle immagini siccome alcune sono mancanti")

# # Rename all files with a "t_" prefix to avoid name conflicts
# for new_idx, f_jpg in enumerate(orig_files):
#     base_name = os.path.splitext(f_jpg)[0]
#     f_png = base_name + ".png"

#     temp_name_jpg = f"t_{new_idx:03d}.jpg"
#     temp_name_png = f"t_{new_idx:03d}.png"

#     # Rename original image
#     os.rename(os.path.join(paths["orig"], f_jpg),
#               os.path.join(paths["orig"], temp_name_jpg))
#     # Rename label image
#     os.rename(os.path.join(paths["label"], f_png),
#               os.path.join(paths["label"], temp_name_png))
#     # Rename RGB mask
#     os.rename(os.path.join(paths["rgb"], f_png),
#               os.path.join(paths["rgb"], temp_name_png))

# # Now remove the "t_" prefix to set new numbering
# for f in os.listdir(paths["orig"]):
#     if f.startswith("t_"):
#         new_name = f.replace("t_", "")
#         os.rename(os.path.join(paths["orig"], f),
#                   os.path.join(paths["orig"], new_name))

# for f in os.listdir(paths["label"]):
#     if f.startswith("t_"):
#         new_name = f.replace("t_", "")
#         os.rename(os.path.join(paths["label"], f),
#                   os.path.join(paths["label"], new_name))

# for f in os.listdir(paths["rgb"]):
#     if f.startswith("t_"):
#         new_name = f.replace("t_", "")
#         os.rename(os.path.join(paths["rgb"], f),
#                   os.path.join(paths["rgb"], new_name))

In [None]:
# Re-list the original images so the list matches the folder's current contents.
orig_files = sorted(filter(lambda fname: fname.endswith(".jpg"), os.listdir(paths["orig"])))

### 1.2. Image visualizations

- Visualize some images and corresponding segmentation masks.

In [None]:
# Show up to 10 randomly chosen samples side by side: original photo,
# semantic label image and RGB colour mask.
# Clamping the sample size avoids random.sample() raising ValueError when the
# folder holds fewer than 10 images.
idxs = random.sample(range(len(orig_files)), min(10, len(orig_files)))

for i in idxs:
    f_jpg = orig_files[i]
    # Label and mask files share the photo's stem but are stored as .png.
    f_png = os.path.splitext(f_jpg)[0] + ".png"

    # Image.open() keeps the underlying file open; the context managers close
    # all three files once the figure has been rendered.
    with Image.open(os.path.join(paths["orig"], f_jpg)) as img_orig, \
         Image.open(os.path.join(paths["label"], f_png)) as img_label, \
         Image.open(os.path.join(paths["rgb"], f_png)) as img_rgb:

        fig, axes = plt.subplots(1, 3, figsize=(9, 3))
        fig.suptitle(f"Index {i}")
        panels = ((img_orig, "Original"), (img_label, "Label"), (img_rgb, "RGB Mask"))
        for ax, (img, title) in zip(axes, panels):
            ax.imshow(img)
            ax.set_title(title)
            ax.axis('off')
        plt.show()


In [None]:
# Show up to 3 random samples (photo + RGB mask) with a legend listing only the
# classes whose colour actually occurs in that mask.
# Clamping the sample size avoids a ValueError on folders with fewer than 3 images.
idxs = random.sample(range(len(orig_files)), min(3, len(orig_files)))

for i in idxs:
    f_jpg = orig_files[i]
    # The mask shares the photo's stem but is stored as .png.
    f_png = os.path.splitext(f_jpg)[0] + ".png"

    # Context managers close both image files once the figure has been rendered.
    with Image.open(os.path.join(paths["orig"], f_jpg)) as img_orig, \
         Image.open(os.path.join(paths["rgb"], f_png)) as img_rgb:

        # Force exactly 3 channels so the (-1, 3) reshape is valid even if the
        # PNG is stored as RGBA or palette mode.
        arr = np.asarray(img_rgb.convert("RGB")).reshape(-1, 3)

        # np.unique deduplicates the pixel rows inside NumPy instead of building
        # one Python tuple per pixel; .tolist() yields plain ints so the tuples
        # compare equal to the (r, g, b) tuples in color_items.
        unique_colors = set(map(tuple, np.unique(arr, axis=0).tolist()))

        present = [(name, rgb) for name, rgb in color_items if rgb in unique_colors]

        fig, (ax_orig, ax_mask) = plt.subplots(1, 2, figsize=(11, 4))
        fig.suptitle(f"Index {i}")
        ax_orig.imshow(img_orig)
        ax_orig.set_title("Original")
        ax_orig.axis('off')
        ax_mask.imshow(img_rgb)
        ax_mask.set_title("RGB Mask")
        ax_mask.axis('off')

        # Matplotlib expects colour components in [0, 1].
        handles = [Patch(facecolor=(r/255, g/255, b/255), label=name) for name, (r, g, b) in present]
        # Anchor the legend just outside the right edge of the mask panel.
        ax_mask.legend(handles=handles, bbox_to_anchor=(1.02, 1), loc='upper left', fontsize='small')
        plt.show()

# 2. Split train/test/valid

- Decide on a meaningful split between training/validation/testing data.
- Initially, you can “cheat” by setting validation==testing.

In [None]:
# Deterministic shuffle of the file list before splitting.
# Sorting first makes the cell idempotent: shuffling in place an already
# shuffled list (e.g. when the cell is re-run) would otherwise produce a
# different permutation, and therefore a different split, on every run.
orig_files = sorted(orig_files)
random.seed(42)
random.shuffle(orig_files)

In [None]:
# Recompute the total from the list that is actually sliced below, so the
# boundaries cannot drift from a count taken before the folder was re-listed.
n_total = len(orig_files)

train_end = int(0.7 * n_total)   # first 70% training
val_end = int(0.85 * n_total)    # next 15% validation, remaining ~15% test

# Split the files
train_files = orig_files[:train_end]
val_files = orig_files[train_end:val_end]
test_files = orig_files[val_end:]

# The three splits must partition the file list exactly.
assert len(train_files) + len(val_files) + len(test_files) == n_total

In [None]:
# Persist each split as a one-column CSV inside the dataset folder.

base_path = "dataset"
split_outputs = {
    "train_files.csv": train_files,
    "val_files.csv": val_files,
    "test_files.csv": test_files,
}
for csv_name, split_files in split_outputs.items():
    pd.Series(split_files).to_csv(os.path.join(base_path, csv_name), index=False)

In [None]:
# Summary of the split sizes.
n_train, n_val, n_test = len(train_files), len(val_files), len(test_files)
print(f"Total images: {n_total}")
print(f"Training: {n_train}, Validation: {n_val}, Test: {n_test}")

# 3. Generate binary mask

**TODO:** describe the binary-mask generation step (this section is still a placeholder).
