In [3]:
import os
import cv2
import numpy as np
from sklearn.utils import shuffle

# Parameters
target_size = (28, 28)  # (height, width) every image is brought to


def load_digit_images(dataset_dir, target_size, num_classes=10):
    """Load the per-class image folders found under ``dataset_dir``.

    One sub-directory per class is expected, named ``0`` .. ``num_classes - 1``.
    Each readable file is loaded as grayscale, resized to ``target_size`` if
    needed, inverted, scaled to float32 in [0, 1] and given a trailing channel
    axis. Missing class directories and unreadable files are reported with a
    printed message and skipped.

    Args:
        dataset_dir: Root directory that holds the class sub-directories.
        target_size: ``(height, width)`` of the returned images.
        num_classes: How many class sub-directories to look for.

    Returns:
        ``(images, labels)`` -- two parallel lists. Each image is a float32
        array of shape ``(height, width, 1)``; each label is the integer class.
        Both lists are empty when nothing could be loaded.
    """
    images = []
    labels = []
    height, width = target_size

    for label in range(num_classes):
        class_dir = os.path.join(dataset_dir, str(label))
        # isdir rather than exists: a stray *file* named like a class would
        # otherwise reach os.listdir and raise instead of being skipped.
        if not os.path.isdir(class_dir):
            print(f"Directory {class_dir} not found, skipping")
            continue

        # os.listdir returns entries in arbitrary order. Sorting fixes the load
        # order, which is what makes the seeded shuffle below reproducible.
        for file in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, file)

            try:
                # imread signals failure by returning None, not by raising.
                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                if img is None:
                    print(f"Could not read {img_path}, skipping")
                    continue

                if img.shape != (height, width):
                    # cv2.resize takes dsize as (width, height) -- the reverse
                    # of the (rows, cols) order used by ndarray.shape.
                    img = cv2.resize(img, (width, height))

                # Invert, then normalize to [0, 1].
                img = cv2.bitwise_not(img)
                img = img.astype(np.float32) / 255.0

                # A grayscale read is 2-D; append the channel axis.
                img = np.expand_dims(img, axis=-1)

                # Final guard: anything that is not exactly (H, W, 1) is dropped
                # so the list can later be converted to one dense array.
                if img.shape != (height, width, 1):
                    print(f"Unexpected shape {img.shape} for {img_path}, skipping")
                    continue
            except Exception as e:
                # Best effort: one bad file must not abort the whole load.
                print(f"Error processing {img_path}: {e}")
                continue

            images.append(img)
            labels.append(label)

    return images, labels


x, y = load_digit_images('dataset_farsi', target_size)

if x:
    # Every image already passed the exact-shape check in the loader, so the
    # list converts to a single dense (N, H, W, 1) float32 array.
    x = np.array(x)
    y = np.array(y)

    # Shuffle images and labels together with a fixed seed.
    x, y = shuffle(x, y, random_state=42)

    print("Data loaded successfully")
    print("Input shape:", x.shape)
    print("Labels shape:", y.shape)
else:
    print("No valid images found in the dataset")

Data loaded successfully
Input shape: (80000, 28, 28, 1)
Labels shape: (80000,)
