# 🧼 Data Cleaning Notebook (Updated & Modularized)
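This notebook filters, resizes, and moves images from class-based folders using helper functions from `utils.preprocessing`. It then plots the number of cleaned images per class, logs the paths of invalid images, and splits the cleaned data into train and validation folders (80/20).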

In [None]:
from utils.preprocessing import clean_and_resize_images
import matplotlib.pyplot as plt

In [None]:
# Where the cleaned copies are written, and where the raw class folders are read from.
# Both names are reused by later cells, so they stay as plain module-level strings.
output_path = "./data/train_images_cleaned"
source_path = "./data/train_images"

# Run the cleaning helper; its return value is kept so the invalid entries can be
# counted here and logged in a later cell.
# NOTE(review): presumably a collection of invalid image paths — confirm against
# utils.preprocessing.clean_and_resize_images.
invalids = clean_and_resize_images(source_path, output_path)
print(f"✅ Done cleaning. {len(invalids)} invalid images found.")

In [None]:
# Visualize class distribution in cleaned folder
import os

# Map each class folder name to the number of entries it contains.
# - Only sub-directories of output_path are treated as classes: a stray file at
#   the top level (e.g. .DS_Store, Thumbs.db) would otherwise make the inner
#   os.listdir() raise NotADirectoryError.
# - Names are sorted because os.listdir() returns entries in arbitrary order,
#   which would make the bar order differ between runs and machines.
class_counts = {
    folder: len(os.listdir(os.path.join(output_path, folder)))
    for folder in sorted(os.listdir(output_path))
    if os.path.isdir(os.path.join(output_path, folder))
}

plt.figure(figsize=(10, 5))
# Pass concrete lists (labels, heights) rather than dict views.
plt.bar(list(class_counts), list(class_counts.values()), color='lightgreen')
plt.xticks(rotation=45)  # class names can be long; tilt them so they do not overlap
plt.title("Number of Cleaned Images per Class")
plt.xlabel("Class")
plt.ylabel("Image Count")
plt.tight_layout()
plt.show()

In [None]:
# Optionally save invalid image paths (one path per line)
# Explicit UTF-8 so paths containing non-ASCII characters are written the same
# way on every platform instead of depending on the locale's default encoding.
with open("invalid_images_log.txt", "w", encoding="utf-8") as f:
    for path in invalids:
        # f-string formatting accepts both str and os.PathLike entries,
        # whereas `path + "\n"` would raise TypeError for pathlib.Path objects.
        f.write(f"{path}\n")

In [None]:
# Split into train and val folders (80/20)
# The helper is imported here, in the cell that uses it, rather than in the
# notebook's first import cell.
from utils.preprocessing import split_train_val

# train_ratio=0.8 requests the 80/20 split named above. The helper is called on
# the cleaned output folder and its return value is not used.
# NOTE(review): presumably it creates the train/val folders under or next to
# output_path — confirm against utils.preprocessing.split_train_val.
split_train_val(output_path, train_ratio=0.8)
print("✅ Done splitting train/val.")