anomaly_dataset/normal/      # Only forest images

anomaly_dataset/anomaly/     # All other categories


In [3]:
import os
import shutil
from pathlib import Path
import random

# Set base directory of the dataset
base_dir = "archive/seg_test/seg_test"

# Define classes
normal_class = "forest"
anomaly_classes = ["buildings", "glacier", "mountain", "sea", "street"]

# Output directories
output_dir = "anomaly_dataset"
normal_dir = os.path.join(output_dir, "normal")
anomaly_dir = os.path.join(output_dir, "anomaly")

# Create output folders
os.makedirs(normal_dir, exist_ok=True)
os.makedirs(anomaly_dir, exist_ok=True)

# Copy normal class images
for img_name in os.listdir(os.path.join(base_dir, normal_class)):
    src = os.path.join(base_dir, normal_class, img_name)
    dst = os.path.join(normal_dir, img_name)
    shutil.copy(src, dst)

# Copy anomaly class images
for cls in anomaly_classes:
    class_path = os.path.join(base_dir, cls)
    for img_name in os.listdir(class_path):
        src = os.path.join(class_path, img_name)
        dst = os.path.join(anomaly_dir, f"{cls}_{img_name}")
        shutil.copy(src, dst)

print("Anomaly dataset created!")


Anomaly dataset created!


In [6]:
import os
import random
import shutil

# Parameters
data_dir = "anomaly_dataset"
output_dir = "split_anomaly_dataset"
c = 0.1  # Proportion of anomalies in each set

# Read and shuffle image file paths
normal_images = [os.path.join(data_dir, "normal", img) for img in os.listdir(os.path.join(data_dir, "normal"))]
anomaly_images = [os.path.join(data_dir, "anomaly", img) for img in os.listdir(os.path.join(data_dir, "anomaly"))]

random.shuffle(normal_images)
random.shuffle(anomaly_images)

# Decide how many images go into each split
total_samples_per_split = min(len(normal_images) + len(anomaly_images), 20000) // 2
num_anomalies = int(c * total_samples_per_split)
num_normals = total_samples_per_split - num_anomalies

# Ensure we don't request more images than we have
num_normals = min(num_normals, len(normal_images) // 2)
num_anomalies = min(num_anomalies, len(anomaly_images) // 2)

# Now build splits
train_normal = normal_images[:num_normals]
test_normal = normal_images[num_normals:num_normals*2]

train_anomaly = anomaly_images[:num_anomalies]
test_anomaly = anomaly_images[num_anomalies:num_anomalies*2]

# Helper to copy files
def copy_files(file_list, target_dir):
    os.makedirs(target_dir, exist_ok=True)
    for f in file_list:
        shutil.copy(f, os.path.join(target_dir, os.path.basename(f)))

# Copy all files to split folders
copy_files(train_normal, os.path.join(output_dir, "train", "normal"))
copy_files(train_anomaly, os.path.join(output_dir, "train", "anomaly"))
copy_files(test_normal, os.path.join(output_dir, "test", "normal"))
copy_files(test_anomaly, os.path.join(output_dir, "test", "anomaly"))

print(f"Train/Test split complete.")
print(f"Train -> Normal: {len(train_normal)}, Anomaly: {len(train_anomaly)}")
print(f"Test  -> Normal: {len(test_normal)}, Anomaly: {len(test_anomaly)}")


Train/Test split complete.
Train -> Normal: 237, Anomaly: 150
Test  -> Normal: 237, Anomaly: 150
