In [5]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
Downloading joblib-1.5.1-py3-none-any.whl (307 kB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.5.1 scikit-learn-1.6.1 threadpoolctl-3.6.0



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split

Defining Paths

In [13]:
# Resolve the input and output directories to absolute paths; both are given
# relative to the current working directory.
raw_folder, processed_folder = map(
    os.path.abspath, ('../data/raw', '../data/processed')
)

# Create the output directory up front; an already existing one is not an error.
os.makedirs(processed_folder, exist_ok=True)

Checking the images

In [14]:
# Collect the image file names found directly inside raw_folder, matching the
# extension case-insensitively.
# os.listdir returns entries in arbitrary order, so the names are sorted: a
# seeded split of this list then selects the same files on every machine.
image_extensions = ('.png', '.jpg', '.jpeg')
image_files = sorted(
    name for name in os.listdir(raw_folder)
    if name.lower().endswith(image_extensions)
)
print(f"Found {len(image_files)} images.")

Found 529 images.


Train and Validation Split

In [15]:
# Hold out a fixed fraction of the file names for validation; the fixed seed
# keeps the shuffle repeatable for a given input list.
VAL_FRACTION = 0.2
SPLIT_SEED = 42
train_files, val_files = train_test_split(
    image_files,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
)
print(f"Training images: {len(train_files)}, Validation images: {len(val_files)}")

Training images: 423, Validation images: 106


Create Folders for Processed Data

In [16]:
# Create one output directory per (split, colour mode) pair:
# train/color, train/gray, val/color, val/gray under processed_folder.
split_mode_dirs = (
    os.path.join(split_name, mode_name)
    for split_name in ('train', 'val')
    for mode_name in ('color', 'gray')
)
for relative_dir in split_mode_dirs:
    # exist_ok=True makes this cell safe to re-run.
    os.makedirs(os.path.join(processed_folder, relative_dir), exist_ok=True)

Process and Save Images

In [17]:
# Resize every image to 256x256 and write a colour copy plus a grayscale copy
# into the matching split folder. Output files are named after the image's
# position in its split list, so a skipped input leaves a gap in the numbering.
for split, files in [('train', train_files), ('val', val_files)]:
    for idx, img_file in enumerate(files):
        # Read image; an unreadable file shows up as None and is skipped.
        img_path = os.path.join(raw_folder, img_file)
        color = cv2.imread(img_path)
        if color is None:
            print(f"Could not read {img_file}. Skipping...")
            continue

        # Resize to 256x256 (aspect ratio is not preserved)
        color = cv2.resize(color, (256, 256))

        # Convert to grayscale
        gray = cv2.cvtColor(color, cv2.COLOR_BGR2GRAY)

        # Save color and grayscale images. cv2.imwrite can signal failure via a
        # False return value without raising, so check it instead of silently
        # ending up with an incomplete dataset.
        for mode, image in (('color', color), ('gray', gray)):
            out_path = os.path.join(processed_folder, split, mode, f"{idx}.png")
            if not cv2.imwrite(out_path, image):
                print(f"Could not write {out_path}.")