Check if Data is ready for Training

### Classes in MVTEC_AD dataset

In [2]:
from pathlib import Path

root = Path("/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/raw/MVTEC_AD/mvtec_anomaly_detection")
classes = set()
for category in root.iterdir():
    if not category.is_dir():
        continue
    test_dir = category / "test"
    if test_dir.exists():
        for defect_type in test_dir.iterdir():
            if defect_type.is_dir() and defect_type.name != "good":
                classes.add(defect_type.name)

print(f"Total defect classes: {len(classes)}")
print(sorted(classes))


Total defect classes: 48
['bent', 'bent_lead', 'bent_wire', 'broken', 'broken_large', 'broken_small', 'broken_teeth', 'cable_swap', 'color', 'combined', 'contamination', 'crack', 'cut', 'cut_inner_insulation', 'cut_lead', 'cut_outer_insulation', 'damaged_case', 'defective', 'fabric_border', 'fabric_interior', 'faulty_imprint', 'flip', 'fold', 'glue', 'glue_strip', 'gray_stroke', 'hole', 'liquid', 'manipulated_front', 'metal_contamination', 'misplaced', 'missing_cable', 'missing_wire', 'oil', 'pill_type', 'poke', 'poke_insulation', 'print', 'rough', 'scratch', 'scratch_head', 'scratch_neck', 'split_teeth', 'squeeze', 'squeezed_teeth', 'thread', 'thread_side', 'thread_top']


# Koklektor Dataset

In [3]:
from pathlib import Path

root = Path("/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/raw/KolektorSDD/KolektorSDD2")
train_dir = root / "train"
test_dir = root / "test"

def count_defects(folder):
    defects = 0
    total = 0
    for img in folder.glob("*.png"):
        if img.name.endswith("_GT.png"):
            continue
        total += 1
        mask = folder / f"{img.stem}_GT.png"
        if mask.exists():
            defects += 1
    return total, defects

train_total, train_defects = count_defects(train_dir)
test_total, test_defects = count_defects(test_dir)

print(f"TRAIN: {train_defects}/{train_total} defective")
print(f"TEST: {test_defects}/{test_total} defective")


TRAIN: 2331/2333 defective
TEST: 1004/1004 defective


In [1]:
import os
import shutil
from pathlib import Path
import pandas as pd

# ==============================
# CONFIGURATION
# ==============================
RAW_DATASET_DIR = Path("/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/raw/MVTEC_AD/mvtec_anomaly_detection")  # <-- change this
OUTPUT_DIR = Path("/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed")

# Create output structure
(OUTPUT_DIR / "images/train").mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "images/val").mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "masks/val").mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "metadata").mkdir(parents=True, exist_ok=True)

metadata = []

# ==============================
# ITERATE OVER ALL CATEGORIES
# ==============================
for category in sorted(os.listdir(RAW_DATASET_DIR)):
    category_path = RAW_DATASET_DIR / category
    if not category_path.is_dir():
        continue

    print(f"Processing category: {category}")

    # --- TRAIN (GOOD IMAGES) ---
    train_good_dir = category_path / "train" / "good"
    if train_good_dir.exists():
        for img_file in sorted(train_good_dir.glob("*")):
            new_name = f"{category}_good_train_{img_file.name}"
            dest = OUTPUT_DIR / "images/train" / new_name
            shutil.copy(img_file, dest)
            metadata.append({
                "category": category,
                "subset": "train",
                "type": "good",
                "image": new_name,
                "mask": None
            })

    # --- TEST (GOOD + DEFECTIVE) ---
    test_dir = category_path / "test"
    gt_dir = category_path / "ground_truth"

    if test_dir.exists():
        for defect_type in sorted(os.listdir(test_dir)):
            defect_dir = test_dir / defect_type
            if not defect_dir.is_dir():
                continue

            for img_file in sorted(defect_dir.glob("*")):
                new_name = f"{category}_{defect_type}_{img_file.name}"
                dest = OUTPUT_DIR / "images/val" / new_name
                shutil.copy(img_file, dest)

                mask_file = None
                if defect_type != "good":  # Only defective images have masks
                    mask_dir = gt_dir / defect_type
                    if mask_dir.exists():
                        base_name = img_file.stem
                        possible_masks = list(mask_dir.glob(f"{base_name}*"))
                        if possible_masks:
                            mask_file = possible_masks[0]
                            new_mask_name = f"{category}_{defect_type}_{mask_file.name}"
                            mask_dest = OUTPUT_DIR / "masks/val" / new_mask_name
                            shutil.copy(mask_file, mask_dest)
                            mask_file = new_mask_name

                metadata.append({
                    "category": category,
                    "subset": "val",
                    "type": defect_type,
                    "image": new_name,
                    "mask": mask_file
                })

# ==============================
# SAVE METADATA
# ==============================
df = pd.DataFrame(metadata)
df.to_csv(OUTPUT_DIR / "metadata" / "mapping.csv", index=False)
print("\nOrganization complete!")
print(f"Images and masks saved under: {OUTPUT_DIR}")
print(f"Metadata CSV: {OUTPUT_DIR / 'metadata/mapping.csv'}")


Processing category: bottle
Processing category: cable
Processing category: capsule
Processing category: carpet
Processing category: grid
Processing category: hazelnut
Processing category: leather
Processing category: metal_nut
Processing category: pill
Processing category: screw
Processing category: tile
Processing category: toothbrush
Processing category: transistor
Processing category: wood
Processing category: zipper

Organization complete!
Images and masks saved under: /Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed
Metadata CSV: /Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed/metadata/mapping.csv


## Sanity Checks

**imports**

In [2]:
import os
import cv2
import random
import matplotlib.pyplot as plt

In [3]:
DATASET_PATH = "/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed"

**Verify every defective image has a corresponding mask**

In [6]:
defective_missing_masks = []

for category in categories:
    gt_path = os.path.join(DATASET_PATH, category, "ground_truth")
    test_path = os.path.join(DATASET_PATH, category, "test")
    if not os.path.exists(gt_path):
        continue

    for defect_type in os.listdir(gt_path):
        gt_files = sorted(os.listdir(os.path.join(gt_path, defect_type)))
        test_files = sorted(os.listdir(os.path.join(test_path, defect_type)))

        gt_basenames = [os.path.splitext(f)[0] for f in gt_files]
        test_basenames = [os.path.splitext(f)[0] for f in test_files]

        missing = [f for f in test_basenames if f not in gt_basenames]
        if missing:
            defective_missing_masks.extend([(category, defect_type, m) for m in missing])

if defective_missing_masks:
    print("\n⚠️ Missing masks for these images:")
    for item in defective_missing_masks[:10]:
        print(item)
else:
    print("\n All defective images have corresponding masks!")


 All defective images have corresponding masks!


## checking labels 

In [12]:
import os
import cv2
import random

# === PATHS ===
base_dir = "/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed"
images_dir = os.path.join(base_dir, "images/val")
labels_dir = os.path.join(base_dir, "labels/val")
output_dir = os.path.join(base_dir, "sanity_check_outputs")

os.makedirs(output_dir, exist_ok=True)

num_samples = 10 

# === FUNCTION TO DRAW BOXES ===
def draw_yolo_boxes(img_path, label_path, save_path):
    img = cv2.imread(img_path)
    if img is None:
        return False

    h, w = img.shape[:2]
    if not os.path.exists(label_path):
        return False

    with open(label_path, "r") as f:
        lines = f.readlines()

    for line in lines:
        parts = line.strip().split()
        if len(parts) != 5:
            continue
        cls, x, y, bw, bh = map(float, parts)
        x1 = int((x - bw / 2) * w)
        y1 = int((y - bh / 2) * h)
        x2 = int((x + bw / 2) * w)
        y2 = int((y + bh / 2) * h)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
        cv2.putText(img, f"defect", (x1, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

    cv2.imwrite(save_path, img)
    return True

# === RANDOMLY CHECK SOME LABELS ===
label_files = [f for f in os.listdir(labels_dir) if f.endswith(".txt")]
print(f"Found {len(label_files)} label files.")

sample_files = random.sample(label_files, min(num_samples, len(label_files)))

for lf in sample_files:
    label_path = os.path.join(labels_dir, lf)
    img_name = lf.replace(".txt", ".png")
    img_path = os.path.join(images_dir, img_name)
    if not os.path.exists(img_path):
        img_name = lf.replace(".txt", ".jpg")
        img_path = os.path.join(images_dir, img_name)
        if not os.path.exists(img_path):
            continue
    save_path = os.path.join(output_dir, img_name)
    success = draw_yolo_boxes(img_path, label_path, save_path)
    if success:
        print(f"✓ Visualized: {img_name}")
    else:
        print(f"⚠️ Skipped: {img_name}")

print(f"\nSanity check complete! Labeled samples saved to:\n{output_dir}")


Found 1258 label files.
✓ Visualized: carpet_color_003.png
✓ Visualized: tile_rough_006.png
✓ Visualized: wood_combined_005.png
✓ Visualized: zipper_fabric_border_014.png
✓ Visualized: bottle_broken_large_001.png
✓ Visualized: hazelnut_hole_008.png
✓ Visualized: wood_color_002.png
✓ Visualized: capsule_squeeze_008.png
✓ Visualized: zipper_fabric_interior_011.png
✓ Visualized: zipper_rough_016.png

Sanity check complete! Labeled samples saved to:
/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed/sanity_check_outputs


## Organising Kolektor Dataset

In [14]:
import os
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

base_dir = "/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data"
raw_dir = os.path.join(base_dir, "raw/KolektorSDD/KolektorSDD2")
processed_dir = os.path.join(base_dir, "processed")

images_dir = os.path.join(processed_dir, "images2")
masks_dir = os.path.join(processed_dir, "masks2")
os.makedirs(images_dir, exist_ok=True)
os.makedirs(masks_dir, exist_ok=True)

def copy_file(src, dst):
    try:
        shutil.copy2(src, dst)
    except Exception as e:
        print(f"Error copying {src}: {e}")

def organize_kolektor(raw_split, split_name):
    split_path = os.path.join(raw_dir, raw_split)
    img_out = os.path.join(images_dir, split_name)
    mask_out = os.path.join(masks_dir, split_name)
    os.makedirs(img_out, exist_ok=True)
    os.makedirs(mask_out, exist_ok=True)

    files = [f for f in os.listdir(split_path) if f.lower().endswith((".png", ".jpg"))]

    tasks = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        for f in tqdm(files, desc=f"Organizing {split_name}"):
            src = os.path.join(split_path, f)
            if "_gt" in f.lower():
                dst = os.path.join(mask_out, f.replace("_GT", "").replace("_gt", ""))
            else:
                dst = os.path.join(img_out, f)
            tasks.append(executor.submit(copy_file, src, dst))

        for task in as_completed(tasks):
            _ = task.result()

    print(f"{split_name} done — {len(files)} files processed.")

organize_kolektor("train", "train")
organize_kolektor("test", "val")


Organizing train: 100%|██████████| 4878/4878 [00:00<00:00, 93541.70it/s]


train done — 4878 files processed.


Organizing val: 100%|██████████| 2082/2082 [00:00<00:00, 161456.59it/s]


val done — 2082 files processed.


In [15]:
import os

print("train images:", len(os.listdir("/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed/images2/train")))
print("train masks:", len(os.listdir("/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed/masks2/train")))
print("val images:", len(os.listdir("/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed/images2/val")))
print("val masks:", len(os.listdir("/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed/masks2/val")))


train images: 2439
train masks: 2439
val images: 1041
val masks: 1041


In [1]:
import os
import cv2
import random

# === PATHS ===
base_dir = "/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed"
images_dir = os.path.join(base_dir, "images2/val")
labels_dir = os.path.join(base_dir, "labels2/val")
output_dir = os.path.join(base_dir, "sanity_check_outputs2")

os.makedirs(output_dir, exist_ok=True)

num_samples = 10 

# === FUNCTION TO DRAW BOXES ===
def draw_yolo_boxes(img_path, label_path, save_path):
    img = cv2.imread(img_path)
    if img is None:
        return False

    h, w = img.shape[:2]
    if not os.path.exists(label_path):
        return False

    with open(label_path, "r") as f:
        lines = f.readlines()

    for line in lines:
        parts = line.strip().split()
        if len(parts) != 5:
            continue
        cls, x, y, bw, bh = map(float, parts)
        x1 = int((x - bw / 2) * w)
        y1 = int((y - bh / 2) * h)
        x2 = int((x + bw / 2) * w)
        y2 = int((y + bh / 2) * h)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
        cv2.putText(img, f"defect", (x1, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

    cv2.imwrite(save_path, img)
    return True

# === RANDOMLY CHECK SOME LABELS ===
label_files = [f for f in os.listdir(labels_dir) if f.endswith(".txt")]
print(f"Found {len(label_files)} label files.")

sample_files = random.sample(label_files, min(num_samples, len(label_files)))

for lf in sample_files:
    label_path = os.path.join(labels_dir, lf)
    img_name = lf.replace(".txt", ".png")
    img_path = os.path.join(images_dir, img_name)
    if not os.path.exists(img_path):
        img_name = lf.replace(".txt", ".jpg")
        img_path = os.path.join(images_dir, img_name)
        if not os.path.exists(img_path):
            continue
    save_path = os.path.join(output_dir, img_name)
    success = draw_yolo_boxes(img_path, label_path, save_path)
    if success:
        print(f" Visualized: {img_name}")
    else:
        print(f"Skipped: {img_name}")

print(f"\nSanity check complete! Labeled samples saved to:\n{output_dir}")


Found 147 label files.
 Visualized: 20378_aug1769.png
 Visualized: 20821.png
 Visualized: 20669.png
 Visualized: 20172_aug8612.png
 Visualized: 20632_aug4385.png
 Visualized: 20682_aug6115.png
 Visualized: 20099.png
 Visualized: 20056.png
 Visualized: 20587.png
 Visualized: 20772.png

Sanity check complete! Labeled samples saved to:
/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/processed/sanity_check_outputs2


## NEU-DET

In [5]:
import os

base_dir = "/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/raw/NEU-DET"
splits = ["train", "validation"]

for split in splits:
    images_dir = os.path.join(base_dir, split, "images")
    labels_dir = os.path.join(base_dir, "labels", split)

    # Collect all image filenames (without extension), recursively through class subfolders
    image_files = []
    for root, _, files in os.walk(images_dir):
        for f in files:
            if f.lower().endswith(('.jpg', '.png')):
                image_files.append(os.path.splitext(f)[0])

    # Collect all label filenames (without extension)
    label_files = []
    if os.path.exists(labels_dir):
        for root, _, files in os.walk(labels_dir):
            for f in files:
                if f.endswith('.txt'):
                    label_files.append(os.path.splitext(f)[0])
    else:
        print(f" Labels folder does not exist for split: {split}")

    image_set = set(image_files)
    label_set = set(label_files)

    missing_labels = image_set - label_set
    extra_labels = label_set - image_set

    print(f"\n=== {split.upper()} SPLIT ===")
    print(f"Total images: {len(image_files)}")
    print(f"Total labels: {len(label_files)}")
    print(f"Images without labels: {len(missing_labels)}")
    print(f"Labels without images: {len(extra_labels)}")

    if missing_labels:
        print("Missing labels for images:", sorted(missing_labels))
    if extra_labels:
        print("Labels with no corresponding images:", sorted(extra_labels))



=== TRAIN SPLIT ===
Total images: 1440
Total labels: 1440
Images without labels: 0
Labels without images: 0

=== VALIDATION SPLIT ===
Total images: 360
Total labels: 360
Images without labels: 0
Labels without images: 0


In [8]:
import os
import cv2
import random

# === PATHS ===
base_dir = "/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/raw/NEU-DET"
splits = ["train", "validation"]
output_dir = os.path.join(base_dir, "bbox_visuals")
os.makedirs(output_dir, exist_ok=True)

num_samples = 5  # images per split

# === FUNCTION TO DRAW YOLO BOXES ===
def draw_yolo_boxes(img_path, label_path, save_path):
    img = cv2.imread(img_path)
    if img is None:
        return False
    h, w = img.shape[:2]
    if not os.path.exists(label_path):
        return False
    with open(label_path, "r") as f:
        lines = f.readlines()
    for line in lines:
        parts = line.strip().split()
        if len(parts) != 5:
            continue
        cls_id, x, y, bw, bh = map(float, parts)
        x1 = int((x - bw / 2) * w)
        y1 = int((y - bh / 2) * h)
        x2 = int((x + bw / 2) * w)
        y2 = int((y + bh / 2) * h)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
        cv2.putText(img, str(int(cls_id)), (x1, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
    cv2.imwrite(save_path, img)
    return True

# === HELPER FUNCTION TO FIND IMAGE RECURSIVELY ===
def find_image(image_name, search_dir):
    for root, _, files in os.walk(search_dir):
        for file in files:
            if file.startswith(image_name):
                return os.path.join(root, file)
    return None

# === VISUALIZE RANDOM SAMPLES ===
for split in splits:
    images_dir = os.path.join(base_dir, split)
    labels_dir = os.path.join(base_dir, "labels", split)
    split_output_dir = os.path.join(output_dir, split)
    os.makedirs(split_output_dir, exist_ok=True)

    label_files = [f for f in os.listdir(labels_dir) if f.endswith(".txt")]
    sample_files = random.sample(label_files, min(num_samples, len(label_files)))

    for lf in sample_files:
        label_path = os.path.join(labels_dir, lf)
        img_name = lf.replace(".txt", "")

        img_path = find_image(img_name, images_dir)
        if img_path is None:
            print(f"Image not found for label: {lf}")
            continue

        save_path = os.path.join(split_output_dir, os.path.basename(img_path))
        success = draw_yolo_boxes(img_path, label_path, save_path)
        if success:
            print(f" Visualized: {os.path.basename(img_path)}")
        else:
            print(f"Skipped: {os.path.basename(img_path)}")

print(f"\nBounding box visualization complete! Check the folder:\n{output_dir}")


 Visualized: pitted_surface_218.jpg
 Visualized: inclusion_146.jpg
 Visualized: pitted_surface_207.jpg
 Visualized: scratches_66.jpg
 Visualized: inclusion_107.jpg
 Visualized: pitted_surface_259.jpg
 Visualized: pitted_surface_272.jpg
 Visualized: scratches_255.jpg
 Visualized: pitted_surface_257.jpg
 Visualized: patches_281.jpg

Bounding box visualization complete! Check the folder:
/Users/ajayyy/Desktop/Deep_Learning/Smart-Quality-Inspection-System/data/raw/NEU-DET/bbox_visuals
