Create the CSV file and check for corrupt and duplicate images

In [7]:
import os
import hashlib
import pandas as pd
from PIL import Image
from tqdm import tqdm
from collections import defaultdict


# =========================================================
# >>>>>>>>>>>> ADD YOUR DATASET PATHS HERE <<<<<<<<<<<<<<<
# =========================================================
# Source folders, one per class. main() scans OK_DIR with label 1 and
# NOT_OK_DIR with label 0.
OK_DIR = r"C:\Users\maila\Desktop\Defect_Detection\Renamed_OK"
NOT_OK_DIR = r"C:\Users\maila\Desktop\Defect_Detection\Renamed_Not_OK"

# File that receives the cleaned (ID, label) table. The path is relative, so
# it is resolved against the current working directory.
OUTPUT_CSV = "train.csv"
# =========================================================


def is_image_corrupt(image_path):
    """
    Report whether the file at ``image_path`` fails to open or verify.

    Any exception raised while opening the file or running ``verify()`` is
    treated as corruption, so an unreadable or missing path is reported as
    corrupt as well.

    Returns:
        bool: True if opening/verifying raised, False if both succeeded.
    """
    try:
        with Image.open(image_path) as candidate:
            candidate.verify()
    except Exception:
        # Deliberately broad: every failure mode means "do not use this file".
        return True
    else:
        return False


def compute_image_hash(image_path):
    """
    Return the SHA-256 hex digest of the raw bytes of the file at ``image_path``.

    The digest is computed over the file's bytes (not decoded pixels), so only
    byte-identical files share a hash. The file is read in 8192-byte chunks so
    it is never held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(image_path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                # read() returns b"" at end of file.
                break
            digest.update(block)
    return digest.hexdigest()


def process_folder(folder_path, label, seen_hashes, records, stats):
    """
    Scan one class folder and record every clean, not-yet-seen image.

    Each regular file directly inside ``folder_path`` is counted, passed to
    ``is_image_corrupt`` and, if it passes, hashed with ``compute_image_hash``.
    A file whose hash is already in ``seen_hashes`` is counted as a duplicate
    and skipped. No extension filter is applied, so every regular file reaches
    the corruption check.

    Entries are visited in sorted name order. ``os.listdir`` returns names in
    arbitrary order, so without sorting the copy that survives deduplication
    and the order of ``records`` could change from run to run.

    Args:
        folder_path: Directory to scan (not recursive; non-files are skipped).
        label: Class label stored in each record; must be a key of
            ``stats["per_class"]``.
        seen_hashes: Set of hashes accepted so far; updated in place. Passing
            the same set to several calls makes deduplication span folders.
        records: List that receives ``{"ID": <file name without extension>,
            "label": label}`` dicts; updated in place.
        stats: Dict with integer counters "total", "corrupt", "duplicate",
            "clean" and a "per_class" mapping of label -> the same counters;
            updated in place.

    Returns:
        None. All results are delivered through the mutated arguments.
    """
    # Sort for a deterministic scan order (os.listdir order is arbitrary).
    image_files = sorted(os.listdir(folder_path))
    class_stats = stats["per_class"][label]

    for img_name in tqdm(image_files, desc=f"Processing {os.path.basename(folder_path)}"):
        img_path = os.path.join(folder_path, img_name)

        # Ignore sub-directories and anything else that is not a regular file.
        if not os.path.isfile(img_path):
            continue

        stats["total"] += 1
        class_stats["total"] += 1

        # 1. Corrupt image check
        if is_image_corrupt(img_path):
            stats["corrupt"] += 1
            class_stats["corrupt"] += 1
            continue

        # 2. Duplicate image check (byte-level hash of the file)
        img_hash = compute_image_hash(img_path)
        if img_hash in seen_hashes:
            stats["duplicate"] += 1
            class_stats["duplicate"] += 1
            continue

        seen_hashes.add(img_hash)

        # 3. Store clean record; the ID is the file name without its extension
        image_id = os.path.splitext(img_name)[0]
        records.append({
            "ID": image_id,
            "label": label
        })

        stats["clean"] += 1
        class_stats["clean"] += 1




def main():
    """
    Build the cleaned label CSV and print a data-cleaning report.

    Scans OK_DIR (label 1) and then NOT_OK_DIR (label 0) with
    ``process_folder``, sharing one hash set so byte-identical files are
    deduplicated across both folders. The surviving records are reduced to one
    row per ID and written to OUTPUT_CSV without the index column.

    Because an ID is the file name without its extension, two distinct files
    can share an ID (same stem in both folders, or same stem with different
    extensions). Only the first such row is kept; the number of rows dropped
    this way is reported so the summary can be reconciled with the per-class
    "Clean" counts.
    """
    seen_hashes = set()
    records = []

    stats = {
        "total": 0,
        "corrupt": 0,
        "duplicate": 0,
        "clean": 0,
        "per_class": {
            1: defaultdict(int),  # OK
            0: defaultdict(int)   # NOT OK
        }
    }

    process_folder(OK_DIR, label=1, seen_hashes=seen_hashes, records=records, stats=stats)
    process_folder(NOT_OK_DIR, label=0, seen_hashes=seen_hashes, records=records, stats=stats)

    # Explicit columns keep the frame well-formed when no clean image was
    # found; a column-less frame would make drop_duplicates raise KeyError
    # for the missing "ID" column and the CSV would lack its header.
    df = pd.DataFrame(records, columns=["ID", "label"])
    rows_before = len(df)
    df = df.drop_duplicates(subset="ID")
    id_collisions = rows_before - len(df)
    df.to_csv(OUTPUT_CSV, index=False)

    # ================== SUMMARY REPORT ==================
    print("\n================ DATA CLEANING REPORT ================")
    print(f"Total images scanned        : {stats['total']}")
    print(f"Corrupt images removed      : {stats['corrupt']}")
    print(f"Duplicate images removed    : {stats['duplicate']}")
    print(f"Clean images used           : {len(df)}")
    if id_collisions:
        # Only shown when it happens, so the usual report is unchanged.
        print(f"Dropped for repeated ID     : {id_collisions}")

    print("\n--- Per Class Breakdown ---")
    print("OK Images:")
    print(f"  Total     : {stats['per_class'][1]['total']}")
    print(f"  Corrupt   : {stats['per_class'][1]['corrupt']}")
    print(f"  Duplicate : {stats['per_class'][1]['duplicate']}")
    print(f"  Clean     : {stats['per_class'][1]['clean']}")

    print("\nNOT OK Images:")
    print(f"  Total     : {stats['per_class'][0]['total']}")
    print(f"  Corrupt   : {stats['per_class'][0]['corrupt']}")
    print(f"  Duplicate : {stats['per_class'][0]['duplicate']}")
    print(f"  Clean     : {stats['per_class'][0]['clean']}")

    print(f"\nCSV saved as: {OUTPUT_CSV}")
    print("======================================================")


# Run the cleaning pipeline only when this code is executed directly
# (__name__ equals "__main__"); importing it as a module does not call main().
if __name__ == "__main__":
    main()



Processing Renamed_OK: 100%|██████████| 1002/1002 [00:00<00:00, 2397.77it/s]
Processing Renamed_Not_OK: 100%|██████████| 4699/4699 [00:01<00:00, 2810.81it/s]


Total images scanned        : 5701
Corrupt images removed      : 0
Duplicate images removed    : 0
Clean images used           : 5701

--- Per Class Breakdown ---
OK Images:
  Total     : 1002
  Corrupt   : 0
  Duplicate : 0
  Clean     : 1002

NOT OK Images:
  Total     : 4699
  Corrupt   : 0
  Duplicate : 0
  Clean     : 4699

CSV saved as: train.csv





Resize images to 256x256

In [13]:
import os
from PIL import Image
from tqdm import tqdm

# ================= ADD PATHS HERE =================
# Input folders, one per class. OK_DIR is spelled "Renamed_OK", matching the
# path used by the CSV-building cell; a differently-cased spelling resolves
# only on a case-insensitive filesystem.
OK_DIR = r"C:\Users\maila\Desktop\Defect_Detection\Renamed_OK"
NOT_OK_DIR = r"C:\Users\maila\Desktop\Defect_Detection\Renamed_Not_OK"

# Single destination folder shared by both classes, and the (width, height)
# every image is resized to.
OUTPUT_DIR = r"C:\Users\maila\Desktop\Defect_Detection\Combined_Resized_256"
TARGET_SIZE = (256, 256)
# ==================================================

# Create the destination up front; exist_ok makes re-running the cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Lower-case extensions that mark a file as an image to resize.
VALID_EXTENSIONS = (".jpg", ".jpeg", ".png")


def resize_and_save(folder_path, desc_name):
    """
    Resize every supported image in ``folder_path`` and write it to OUTPUT_DIR.

    Files are selected by extension (VALID_EXTENSIONS, compared in lower
    case), converted to RGB, resized to TARGET_SIZE with bilinear resampling
    and saved under their original file name. The destination keeps only the
    file name, so a file name that occurs in more than one source folder maps
    to the same output file and the later write replaces the earlier one.

    An image that cannot be opened, converted or saved is skipped so one bad
    file does not stop the batch; skipped files are listed after the loop
    rather than being dropped silently.

    Args:
        folder_path: Directory whose images are resized (not recursive).
        desc_name: Short label used in the progress bar and in the summary of
            skipped files.

    Returns:
        None.
    """
    image_files = [
        f for f in os.listdir(folder_path)
        if f.lower().endswith(VALID_EXTENSIONS)
    ]

    failures = []  # (file name, error description) for every skipped image

    for img_name in tqdm(image_files, desc=f"Resizing {desc_name} images"):
        src_path = os.path.join(folder_path, img_name)
        dst_path = os.path.join(OUTPUT_DIR, img_name)  # SAME NAME, NO PREFIX

        try:
            with Image.open(src_path) as img:
                resized = img.convert("RGB").resize(TARGET_SIZE, Image.BILINEAR)
                resized.save(dst_path)
        except Exception as exc:
            # Best effort: remember the failure and carry on with the rest.
            failures.append((img_name, f"{type(exc).__name__}: {exc}"))

    if failures:
        print(f"\n{len(failures)} {desc_name} image(s) could not be resized:")
        for failed_name, reason in failures:
            print(f"  {failed_name}: {reason}")


# Resize each class folder in turn: OK images first, then NOT OK images.
for class_dir, class_tag in ((OK_DIR, "OK"), (NOT_OK_DIR, "NOT OK")):
    resize_and_save(class_dir, desc_name=class_tag)

# Tell the user where the resized copies were written.
print("\nAll images resized to 256x256 and saved in:")
print(OUTPUT_DIR)


Resizing OK images: 100%|██████████| 1002/1002 [00:04<00:00, 204.52it/s]
Resizing NOT OK images: 100%|██████████| 4699/4699 [00:26<00:00, 176.04it/s]


All images resized to 256x256 and saved in:
C:\Users\maila\Desktop\Defect_Detection\Combined_Resized_256





Normalizing images

In [2]:
import os
import numpy as np
from PIL import Image
from tqdm import tqdm

# ================= ADD PATHS HERE =================
IMAGE_DIR = r"C:\Users\maila\Desktop\Defect_Detection\Combined_Resized_256"
OUTPUT_DIR = r"C:\Users\maila\Desktop\Defect_Detection\Normalised_Image_256"
# ==================================================

# Make sure the destination folder exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Lower-case extensions that mark a file as an image to process.
VALID_EXTENSIONS = (".jpg", ".jpeg", ".png")

# Names of all entries in IMAGE_DIR that carry a supported extension.
image_files = list(filter(
    lambda name: name.lower().endswith(VALID_EXTENSIONS),
    os.listdir(IMAGE_DIR),
))

# Stop early when there is nothing to process.
if not image_files:
    raise ValueError("No images found in the folder.")

def _min_max_to_uint8(pixels):
    """Stretch ``pixels`` so its minimum maps to 0 and its maximum to 255, as uint8.

    ``pixels`` is the float32 array built from an RGB image. A constant image
    has no range to stretch, so it is returned with its values unchanged
    (cast to uint8). Multiplying such an image by 255 would push its values
    outside the uint8 range before the cast and corrupt them.
    """
    low = pixels.min()
    high = pixels.max()
    if high > low:
        # Min-max normalization to [0, 1], then back to the 0-255 range.
        return ((pixels - low) / (high - low) * 255).astype(np.uint8)
    return pixels.astype(np.uint8)


skipped = []  # (file name, error description) for images that failed

for img_name in tqdm(image_files, desc="Normalizing images"):
    img_path = os.path.join(IMAGE_DIR, img_name)
    save_path = os.path.join(OUTPUT_DIR, img_name)

    try:
        with Image.open(img_path) as img:
            img_np = np.asarray(img.convert("RGB")).astype(np.float32)

        Image.fromarray(_min_max_to_uint8(img_np)).save(save_path)

    except Exception as exc:
        # Skip unreadable or corrupt images, but keep a record of them.
        skipped.append((img_name, f"{type(exc).__name__}: {exc}"))

if skipped:
    print(f"\n{len(skipped)} image(s) could not be normalized:")
    for skipped_name, skipped_reason in skipped:
        print(f"  {skipped_name}: {skipped_reason}")

print("\nAll images normalized and saved to:")
print(OUTPUT_DIR)


Normalizing images: 100%|██████████| 5701/5701 [01:47<00:00, 53.16it/s]


All images normalized and saved to:
C:\Users\maila\Desktop\Defect_Detection\Normalised_Image_256





Standardizing images

In [3]:
import os
import numpy as np
from PIL import Image
from tqdm import tqdm

# ================= ADD PATHS HERE =================
IMAGE_DIR = r"C:\Users\maila\Desktop\Defect_Detection\Combined_Resized_256"
OUTPUT_DIR = r"C:\Users\maila\Desktop\Defect_Detection\Standardized_Image_256"
# ==================================================

# Make sure the destination folder exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Lower-case extensions that mark a file as an image to process.
VALID_EXTENSIONS = (".jpg", ".jpeg", ".png")

# Names of all entries in IMAGE_DIR that carry a supported extension.
image_files = list(filter(
    lambda name: name.lower().endswith(VALID_EXTENSIONS),
    os.listdir(IMAGE_DIR),
))

# Stop early when there is nothing to process.
if not image_files:
    raise ValueError("No images found in the folder.")

def _standardize_to_uint8(pixels):
    """Z-score ``pixels`` per image, then rescale the result to 0-255 as uint8.

    ``pixels`` is the float32 array built from an RGB image; mean and standard
    deviation are taken over the whole array. The z-scores are rescaled with
    min-max so they fit an 8-bit image (visualization only).

    NOTE: min-max rescaling cancels any shift and positive scaling applied
    before it, so the saved image equals plain min-max scaling of the input up
    to floating-point rounding. The z-scores themselves are not stored.

    A constant image has zero spread, which would make the rescale divide by
    zero and produce NaN pixels. Such an image is returned with its values
    unchanged (cast to uint8) instead.
    """
    mean = pixels.mean()
    std = pixels.std()
    standardized = (pixels - mean) / std if std > 0 else pixels

    low = standardized.min()
    high = standardized.max()
    if high > low:
        return ((standardized - low) / (high - low) * 255).astype(np.uint8)
    return pixels.astype(np.uint8)


skipped = []  # (file name, error description) for images that failed

for img_name in tqdm(image_files, desc="Standardizing images"):
    img_path = os.path.join(IMAGE_DIR, img_name)
    save_path = os.path.join(OUTPUT_DIR, img_name)

    try:
        with Image.open(img_path) as img:
            img_np = np.asarray(img.convert("RGB")).astype(np.float32)

        Image.fromarray(_standardize_to_uint8(img_np)).save(save_path)

    except Exception as exc:
        # Skip unreadable or corrupt images, but keep a record of them.
        skipped.append((img_name, f"{type(exc).__name__}: {exc}"))

if skipped:
    print(f"\n{len(skipped)} image(s) could not be standardized:")
    for skipped_name, skipped_reason in skipped:
        print(f"  {skipped_name}: {skipped_reason}")

print("\nAll images standardized and saved to:")
print(OUTPUT_DIR)


Standardizing images: 100%|██████████| 5701/5701 [00:21<00:00, 265.45it/s]


All images standardized and saved to:
C:\Users\maila\Desktop\Defect_Detection\Standardized_Image_256





In [5]:
import os

# ============== ADD PATH HERE ==============
IMAGE_DIR = r"C:\Users\maila\Desktop\Defect_Detection\Normalised_Image_256"
# ===========================================

# Lower-case extensions that count as images.
VALID_EXTENSIONS = (".jpg", ".jpeg", ".png")

# Count the directory entries whose lower-cased name ends with one of them.
image_count = len([
    name for name in os.listdir(IMAGE_DIR)
    if name.lower().endswith(VALID_EXTENSIONS)
])

print("Number of images in folder:", image_count)


Number of images in folder: 5701
