# 🧹 Data Cleaning Notebook (Updated & Modularized)

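This notebook filters, resizes, and moves images from class-based folders using helper functions from `utils.preprocessing`. It then plots the number of cleaned images per class, optionally logs the paths of invalid images, and splits the cleaned data into training and validation sets (80/20).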

In [None]:
import os
import sys
import matplotlib.pyplot as plt

# ✅ Put the parent of the current working directory at the front of sys.path
# so that the `utils` package imported below can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if sys.path.count(project_root) == 0:
    # Slice-assignment at index 0 prepends without replacing existing entries.
    sys.path[:0] = [project_root]

from utils.preprocessing import clean_and_resize_images, split_train_val

In [None]:
# 📁 Define paths
# Both paths are relative, so they resolve against the current working
# directory of the kernel running this notebook.
# source_path is passed as the first argument to clean_and_resize_images.
source_path = "../data/train_images"
# output_path is passed as the second argument to clean_and_resize_images and
# is also the folder read by the class-distribution plot and split_train_val.
output_path = "../data/data_cleaned/train_images_cleaned_v2"

In [None]:
# ✨ Clean & resize
# clean_and_resize_images is imported from utils.preprocessing. Its return
# value is kept in `invalids` and used as a sized collection of image paths:
# its length is reported here and its items are later written to the
# invalid-images log, one per line.
invalids = clean_and_resize_images(source_path, output_path)
print(f"✅ Done Cleaning. {len(invalids)} invalid images found.")

In [None]:
# 📊 Visualize class distribution in cleaned folder
# Map each class folder name to the number of entries it contains.
# Only sub-directories are counted: a stray file directly inside output_path
# (e.g. .DS_Store or Thumbs.db) would otherwise make the inner os.listdir()
# raise NotADirectoryError. Names are sorted because os.listdir() returns
# entries in arbitrary order, which would shuffle the bars between runs.
class_counts = {
    folder: len(os.listdir(os.path.join(output_path, folder)))
    for folder in sorted(os.listdir(output_path))
    if os.path.isdir(os.path.join(output_path, folder))
}

# Materialize the dict views so plt.bar receives plain sequences.
class_names = list(class_counts)
image_counts = list(class_counts.values())

plt.figure(figsize=(10, 5))
plt.bar(class_names, image_counts, color='lightgreen')
plt.xticks(rotation=45)
plt.title("Number of Cleaned Images per Class")
plt.xlabel("Class")
plt.ylabel("Image Count")
plt.tight_layout()
plt.show()

In [None]:
# 📝 (Optional) Save invalid image paths
# Writes one path per line, overwriting any previous log.
log_path = "../outputs/invalid_images_log.txt"

# open(..., "w") does not create missing parent directories, so make sure
# ../outputs exists; exist_ok=True keeps the cell safe to re-run.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# Explicit UTF-8 so file names with non-ASCII characters are written the same
# way regardless of the platform's default encoding.
with open(log_path, "w", encoding="utf-8") as f:
    f.writelines(f"{path}\n" for path in invalids)

In [None]:
# 🧩 Split train/val (80/20)
# train_ratio=0.8 is the "80" of the 80/20 split; the call operates on the
# cleaned-image folder (output_path).
# NOTE(review): where the split is written, and whether files are moved or
# copied, is decided inside utils.preprocessing.split_train_val — confirm
# there before re-running this cell on an already-split folder.
split_train_val(output_path, train_ratio=0.8)
print("✅ Done splitting train/val.")