In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# -----------------------------
# PATHS
# -----------------------------
# Both directories are relative paths, so they resolve against the current
# working directory of the process running this script.
# RAW_DIR is the root that each CSV "image_path" entry is joined onto.
RAW_DIR = "../dataset/raw/vehicle_damage_dataset"
# PROCESSED_DIR is the directory that holds the labels CSV.
PROCESSED_DIR = "../dataset/processed"

# Full path of the labels CSV that is loaded with pd.read_csv below.
CSV_PATH = os.path.join(PROCESSED_DIR, "labels_no_blur.csv")

# -----------------------------
# PARAMETERS
# -----------------------------
# Target size handed to cv2.resize for every image.
IMG_SIZE = (224, 224)   # Resize size for CNNs

# -----------------------------
# LOAD CSV
# -----------------------------
df = pd.read_csv(CSV_PATH)

# When the CSV carries no "class" column, derive one from the path prefix:
# paths beginning with "real" (case-insensitive) are "real", the rest "fake".
if "class" not in df:

    def _class_from_path(path):
        """Return "real" if *path* starts with "real" (any case), else "fake"."""
        if path.lower().startswith("real"):
            return "real"
        return "fake"

    df["class"] = df["image_path"].apply(_class_from_path)

print("✅ CSV loaded:", df.shape[0], "images")

# -----------------------------
# LABEL ENCODING
# -----------------------------
# LabelEncoder assigns integer codes following the sorted order of the class
# names, so for the "fake"/"real" classes the mapping is fake=0, real=1
# (not real=0, fake=1).
le = LabelEncoder()
df["label"] = le.fit_transform(df["class"])  # fake=0, real=1

# .tolist() turns the NumPy integer codes into plain ints so the mapping
# prints as bare numbers whatever scalar repr the installed NumPy uses.
label_codes = le.transform(le.classes_).tolist()
print("Label Mapping:", dict(zip(le.classes_, label_codes)))

# -----------------------------
# IMAGE RESIZING + NORMALIZATION
# -----------------------------
# Loads every image listed in df, resizes it to IMG_SIZE, converts it to RGB
# and scales it to float32 in [0, 1]. X stacks the images and y holds the
# matching labels, in CSV order with unreadable files left out.
images = []
labels = []
skipped_paths = []  # files that cv2.imread could not load

# Only two columns are needed, so iterate them directly instead of building
# a full row Series per image with iterrows().
for rel_path, label in zip(df["image_path"], df["label"]):
    img_path = os.path.join(RAW_DIR, rel_path)
    img = cv2.imread(img_path)

    # cv2.imread signals a missing or undecodable file by returning None
    # rather than raising; remember the path so the drop is reported below.
    if img is None:
        skipped_paths.append(img_path)
        continue

    # Resize
    img = cv2.resize(img, IMG_SIZE)

    # Convert BGR → RGB (OpenCV loads images in BGR channel order)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Normalize to [0, 1]
    img = img.astype("float32") / 255.0

    images.append(img)
    labels.append(label)

X = np.array(images)
y = np.array(labels)

# Surface dropped files so a short X is never mistaken for the full dataset.
if skipped_paths:
    print(
        f"⚠️ Skipped {len(skipped_paths)} unreadable image(s); "
        f"first: {skipped_paths[0]}"
    )

# -----------------------------
# OUTPUT SUMMARY
# -----------------------------
print("\n✅ Preprocessing Done")

# One line per array: caption followed by its shape.
# X is expected to come out as (N, 224, 224, 3).
for caption, shape in (
    ("X shape (images):", X.shape),
    ("y shape (labels):", y.shape),
):
    print(caption, shape)


✅ CSV loaded: 7183 images
Label Mapping: {'fake': 0, 'real': 1}

✅ Preprocessing Done
X shape (images): (7183, 224, 224, 3)
y shape (labels): (7183,)
