In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

def load_dataset(dataset_path, max_samples_per_class=50):
    """Load up to ``max_samples_per_class`` images from each class folder.

    Every immediate sub-directory of ``dataset_path`` is treated as one class;
    class indices follow the alphabetical order of the directory names. Each
    image is read with OpenCV, converted from BGR to RGB and resized to 64x64.
    Empty, unreadable or undecodable files are skipped.

    Args:
        dataset_path: Directory that contains one sub-directory per class.
        max_samples_per_class: Maximum number of images loaded per class.

    Returns:
        On success, ``(images, labels, class_names)``: an image array of shape
        ``(n, 64, 64, 3)``, an integer label array of shape ``(n,)`` and the
        list of class directory names.

        When ``dataset_path`` is not a directory, or no image could be loaded,
        the synthetic fallback is returned as
        ``(generate_synthetic_data(), [], [])`` -- i.e. the first element is
        itself an ``(images, labels)`` tuple. Callers detect the fallback with
        ``isinstance(first_element, tuple)``.
    """
    # A missing path and a path that is not a directory are handled alike;
    # os.listdir would raise on the latter.
    if not os.path.isdir(dataset_path):
        return generate_synthetic_data(), [], []

    images = []
    labels = []
    class_names = []

    class_dirs = sorted(
        entry for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )

    for class_idx, class_name in enumerate(class_dirs):
        class_path = os.path.join(dataset_path, class_name)
        class_names.append(class_name)

        # Case-insensitive extension match (also accepts e.g. ".JPG"); sorted
        # so that the per-class cap always keeps the same files.
        image_files = sorted(
            name for name in os.listdir(class_path)
            if name.lower().endswith(('.jpg', '.png', '.jpeg'))
        )
        loaded_count = 0

        for image_file in image_files:
            if loaded_count >= max_samples_per_class:
                break

            image_path = os.path.join(class_path, image_file)

            try:
                if os.path.getsize(image_path) == 0:
                    continue

                img = cv2.imread(image_path)
                if img is None:  # OpenCV could not decode the file
                    continue

                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img_resized = cv2.resize(img_rgb, (64, 64))
            except (OSError, cv2.error):
                # Best effort: skip files that vanish, are unreadable or make
                # OpenCV fail, without swallowing KeyboardInterrupt & co.
                continue

            images.append(img_resized)
            labels.append(class_idx)
            loaded_count += 1

    if not images:
        # Same shape as the missing-directory fallback above, so the caller's
        # three-way unpacking keeps working.
        return generate_synthetic_data(), [], []

    return np.array(images), np.array(labels), class_names

def generate_synthetic_data():
    """Build a synthetic card dataset: 13 ranks x 4 suits x 20 noisy copies.

    Each class has a fixed 64x64 RGB template (reddish for suits 0-1, dark
    grey for suits 2-3) whose layout depends on the rank. Every sample is that
    template plus Gaussian noise (sigma 15) from the global NumPy RNG, clipped
    to [0, 255].

    Returns:
        ``(images, labels)``: a uint8 array of shape ``(1040, 64, 64, 3)`` and
        a label array of shape ``(1040,)`` with ``label = rank * 4 + suit``.
    """
    side = 64
    copies_per_class = 20
    samples, targets = [], []

    # label == rank * 4 + suit, so walking the labels in order visits the
    # (rank, suit) pairs exactly like a rank-outer / suit-inner double loop.
    for label in range(13 * 4):
        rank, suit = divmod(label, 4)
        ink = [200, 50, 50] if suit < 2 else [50, 50, 50]

        template = np.zeros((side, side, 3), dtype=np.uint8)
        if rank == 0:
            # Rank 0: a single centred square.
            template[25:40, 25:40] = ink
        elif rank > 9:
            # Ranks 10-12: large block with a white inset.
            template[15:50, 20:45] = ink
            template[20:30, 25:40] = [255, 255, 255]
        else:
            # Ranks 1-9: rank + 1 small squares (capped at 8) in two columns
            # of up to four.
            for pip in range(min(rank + 1, 8)):
                column, row = divmod(pip, 4)
                top = 10 + row * 12
                left = 15 + column * 25
                template[top:top + 8, left:left + 8] = ink

        for _ in range(copies_per_class):
            jitter = np.random.normal(0, 15, (side, side, 3))
            noisy = np.clip(template.astype(float) + jitter, 0, 255)
            samples.append(noisy.astype(np.uint8))
            targets.append(label)

    return np.array(samples), np.array(targets)

# Load the card dataset (load_dataset substitutes synthetic cards if needed)
dataset_path = '../../data/train_dataset'
card_images, card_labels, card_class_names = load_dataset(dataset_path)

# On the synthetic fallback the first slot carries an (images, labels) pair
# instead of an image array: unpack it and make up the class names.
if isinstance(card_images, tuple):
    card_images, card_labels = card_images
    n_synthetic_classes = len(np.unique(card_labels))
    card_class_names = [f"synthetic_{i}" for i in range(n_synthetic_classes)]

n_images = card_images.shape[0]
n_classes = len(np.unique(card_labels))
print(f"Dataset loaded: {n_images} images, {n_classes} classes")

# Preview: a 2x5 grid of images drawn at random (with replacement)
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for i, ax in enumerate(axes.ravel()):
    if i >= len(card_images):
        continue
    idx = np.random.randint(0, len(card_images))
    ax.imshow(card_images[idx])
    ax.set_title(f'Class {card_labels[idx]}')
    ax.axis('off')

plt.tight_layout()
plt.show()

In [2]:
import cv2

class PCAFromScratch:
    """
    PCA implementation from scratch for image compression.

    The principal axes are the leading eigenvectors of the covariance matrix
    of the centred (and optionally scaled) data.

    Args:
        n_components: Number of principal components to keep.
        normalization: ``'column'`` divides every feature by its own standard
            deviation, ``'full'`` divides everything by one global standard
            deviation, any other value only centres the data.
    """
    def __init__(self, n_components, normalization='column'):
        self.n_components = n_components
        self.normalization = normalization
        # Everything below is populated by fit().
        self.mean_ = None                       # per-feature mean
        self.std_ = None                        # scale divisor (array or scalar)
        self.components_ = None                 # (n_components, n_features)
        self.eigenvalues_ = None                # variance along each kept axis
        self.explained_variance_ratio_ = None   # eigenvalues_ / total variance

    def fit(self, X):
        """
        Fit PCA to the data

        Args:
            X: Data matrix (n_samples, n_features)

        Returns:
            self, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)

        # Center the data
        self.mean_ = np.mean(X, axis=0)
        X_centered = X - self.mean_

        # Pick the scale; a zero deviation is replaced by 1 so constant
        # features (or fully constant data) never cause a division by zero.
        if self.normalization == 'column':
            std = np.std(X_centered, axis=0)
            std[std == 0] = 1
        elif self.normalization == 'full':
            std = np.std(X_centered)
            if std == 0:
                std = 1.0
        else:
            std = 1
        self.std_ = std
        X_normalized = X_centered / self.std_

        # Covariance matrix; np.cov returns a 0-d array for a single feature,
        # so force it back to 2-D for the eigen-solver.
        cov_matrix = np.atleast_2d(np.cov(X_normalized.T))

        # eigh is appropriate because the covariance matrix is symmetric.
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Sort eigenvalues and eigenvectors in descending order
        order = np.argsort(eigenvalues)[::-1]
        # Round-off can push zero eigenvalues slightly negative.
        eigenvalues = np.clip(eigenvalues[order], 0, None)
        eigenvectors = eigenvectors[:, order]

        # Select top n_components
        self.components_ = eigenvectors[:, :self.n_components].T
        self.eigenvalues_ = eigenvalues[:self.n_components]

        # Explained variance ratio; reported as zeros when the data has no
        # variance at all instead of 0/0 = NaN.
        total_variance = np.sum(eigenvalues)
        if total_variance > 0:
            self.explained_variance_ratio_ = self.eigenvalues_ / total_variance
        else:
            self.explained_variance_ratio_ = np.zeros_like(self.eigenvalues_)

        return self

    def transform(self, X):
        """
        Transform data to PCA space

        Args:
            X: Data matrix (n_samples, n_features)

        Returns:
            X_transformed: Transformed data (n_samples, n_components)
        """
        # std_ is 1 when no scaling was requested, so one expression covers
        # every normalization mode.
        X_normalized = (X - self.mean_) / self.std_

        # Project onto principal components
        return np.dot(X_normalized, self.components_.T)

    def inverse_transform(self, X_transformed):
        """
        Transform data back to original space

        Args:
            X_transformed: Transformed data (n_samples, n_components)

        Returns:
            X_reconstructed: Reconstructed data (n_samples, n_features)
        """
        # Project back, then undo the scaling and the centring.
        X_reconstructed = np.dot(X_transformed, self.components_)
        return X_reconstructed * self.std_ + self.mean_

    def fit_transform(self, X):
        """
        Fit PCA and transform data
        """
        return self.fit(X).transform(X)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def pca_compress_images(images, n_components, normalization='column'):
    """Compress and immediately reconstruct ``images`` with PCA.

    PCA is fitted on the very images that are passed in. The principal axes
    are obtained from a thin SVD of the normalised data matrix, which yields
    the same axes and variances as an eigendecomposition of the covariance
    matrix without ever building the (n_features x n_features) matrix.

    Args:
        images: Array (or list of arrays) of shape ``(n_images, ...)`` with
            pixel values in the 0-255 range.
        n_components: Number of principal components to keep.
        normalization: ``'column'`` scales every pixel position by its own
            standard deviation; any other value scales by one global standard
            deviation.

    Returns:
        ``(reconstructed, compression_ratio, variance_ratio)``: a uint8 array
        shaped like ``images``, ``n_features / n_components``, and the share
        of total variance carried by the kept components (0.0 when the data
        has no variance at all).
    """
    images = np.asarray(images)

    # Flatten images
    original_shape = images.shape
    flattened = images.reshape(original_shape[0], -1).astype(float)
    n_samples = flattened.shape[0]

    # Center data
    mean = flattened.mean(axis=0)
    centered = flattened - mean

    # Normalize; zero deviations are replaced by 1 to avoid 0/0 = NaN.
    if normalization == 'column':
        std = centered.std(axis=0)
        std[std == 0] = 1
    else:
        std = centered.std()
        if std == 0:
            std = 1.0
    normalized = centered / std

    # PCA through the thin SVD: rows of `axes` are the principal axes sorted
    # by decreasing variance, and s**2 / (n - 1) are the covariance eigenvalues.
    _, singular_values, axes = np.linalg.svd(normalized, full_matrices=False)
    eigenvalues = singular_values ** 2 / max(n_samples - 1, 1)

    # Select components
    components = axes[:n_components]

    # Transform and reconstruct, then undo scaling and centring
    transformed = normalized @ components.T
    reconstructed = (transformed @ components) * std + mean

    # Reshape back; round before the cast so that e.g. 199.9999 becomes 200
    # rather than being truncated to 199.
    reconstructed = reconstructed.reshape(original_shape)
    reconstructed = np.clip(np.rint(reconstructed), 0, 255).astype(np.uint8)

    compression_ratio = flattened.shape[1] / n_components
    total_variance = eigenvalues.sum()
    if total_variance > 0:
        variance_ratio = eigenvalues[:n_components].sum() / total_variance
    else:
        variance_ratio = 0.0

    return reconstructed, compression_ratio, variance_ratio

def show_compression_comparison(original_images, component_counts, num_examples=5):
    """Plot originals next to their PCA reconstructions.

    The first row shows the first ``num_examples`` originals; each following
    row shows the same images reconstructed by ``pca_compress_images`` with
    one of the requested component counts.

    Args:
        original_images: Sequence/array of images; only the first
            ``num_examples`` are used.
        component_counts: Iterable of component counts, one plot row each.
        num_examples: Number of images (columns) to show; capped at
            ``len(original_images)``.

    NOTE(review): PCA is fitted on just the displayed examples, so with k
    examples at most k - 1 components can carry any variance.
    """
    num_examples = min(num_examples, len(original_images))
    n_rows = len(component_counts) + 1

    # squeeze=False always yields a 2-D axes array, including the 1x1 grid
    # for which plt.subplots would otherwise return a bare Axes object.
    fig, axes = plt.subplots(n_rows, num_examples,
                             figsize=(4 * num_examples, 4 * n_rows),
                             squeeze=False)

    # Show originals
    for i in range(num_examples):
        # Report the real dimensionality instead of assuming 64x64x3.
        n_dims = np.asarray(original_images[i]).size
        axes[0, i].imshow(original_images[i])
        axes[0, i].set_title(f'Original\n{n_dims:,} dims')
        axes[0, i].axis('off')

    # Show compressed versions
    for row, n_comp in enumerate(component_counts, 1):
        compressed_imgs, ratio, variance = pca_compress_images(
            original_images[:num_examples], n_comp)

        for i in range(num_examples):
            axes[row, i].imshow(compressed_imgs[i])
            axes[row, i].set_title(f'{n_comp} comp.\n{ratio:.1f}x compression\n{variance:.1%} variance')
            axes[row, i].axis('off')

    plt.suptitle('PCA Image Compression Comparison', fontsize=16)
    plt.tight_layout()
    plt.show()

def compress_and_decompress_images(test_images, training_images, n_components):
    """Fit PCA on ``training_images`` and round-trip ``test_images`` through it.

    The data is centred with the training mean and scaled by one global
    training standard deviation. Principal axes come from a thin SVD of the
    normalised training matrix, which avoids building the
    (n_features x n_features) covariance matrix.

    Args:
        test_images: Sequence/array of equally shaped images to compress.
        training_images: Sequence/array of images of the same shape used to
            fit the PCA basis.
        n_components: Number of principal components (code length per image).

    Returns:
        ``(compressed, reconstructed)``: the codes of shape
        ``(n_test, n_components)`` and the reconstructions as a uint8 array
        with the same shape as ``test_images``.

    Raises:
        ValueError: If ``training_images`` is empty.
    """
    if len(training_images) == 0:
        raise ValueError("training_images must contain at least one image")

    # Flatten
    X_train = np.array([np.asarray(img).ravel() for img in training_images], dtype=float)
    X_test = np.array([np.asarray(img).ravel() for img in test_images], dtype=float)
    # Reconstructions get the test set's own shape rather than a hard-coded
    # (-1, 64, 64, 3).
    test_shape = np.asarray(test_images).shape

    # Fit PCA on training data
    mean = X_train.mean(axis=0)
    centered = X_train - mean
    std = centered.std()
    if std == 0:
        std = 1.0  # constant training data: avoid 0/0 = NaN
    normalized = centered / std

    # Rows of `axes` are the principal axes, already sorted by variance.
    _, _, axes = np.linalg.svd(normalized, full_matrices=False)
    components = axes[:n_components]

    # The thin SVD yields at most min(n_train, n_features) axes; pad with
    # zero rows so the code length is always n_components. The padded
    # coordinates are zero and do not affect the reconstruction.
    missing = n_components - components.shape[0]
    if missing > 0:
        components = np.vstack([components, np.zeros((missing, components.shape[1]))])

    # Transform test data
    test_normalized = (X_test - mean) / std
    compressed = test_normalized @ components.T
    reconstructed = (compressed @ components) * std + mean
    reconstructed = reconstructed.reshape(test_shape)
    # Round before the cast; a bare astype(uint8) would truncate.
    reconstructed = np.clip(np.rint(reconstructed), 0, 255).astype(np.uint8)

    return compressed, reconstructed

def show_compression_pipeline(original_images, training_images, n_components):
    """Compress/decompress ``original_images`` and plot them against the originals.

    The PCA basis is fitted on ``training_images`` via
    ``compress_and_decompress_images``. The figure has the originals in the
    top row and the reconstructions in the bottom row.

    Args:
        original_images: Images to push through the pipeline and display.
        training_images: Images used to fit the PCA basis.
        n_components: Number of principal components to keep.

    Returns:
        ``(compressed_data, reconstructed_images)`` exactly as returned by
        ``compress_and_decompress_images``.
    """
    compressed_data, reconstructed_images = compress_and_decompress_images(
        original_images, training_images, n_components)

    num_examples = len(original_images)
    # squeeze=False keeps `axes` two-dimensional even for a single example.
    fig, axes = plt.subplots(2, num_examples, figsize=(4 * num_examples, 8),
                             squeeze=False)

    for i in range(num_examples):
        axes[0, i].imshow(original_images[i])
        axes[0, i].set_title("Original")
        axes[0, i].axis('off')

        axes[1, i].imshow(reconstructed_images[i])
        axes[1, i].set_title("Reconstructed")
        axes[1, i].axis('off')

    # Use the real per-image dimensionality instead of assuming 64x64x3.
    values_per_image = np.asarray(original_images[0]).size
    compression_ratio = values_per_image / n_components
    plt.suptitle(f'Compression Pipeline - {n_components} components\n{compression_ratio:.1f}x compression')
    plt.tight_layout()
    plt.show()

    return compressed_data, reconstructed_images

In [None]:
# Compare reconstructions of the first five cards across component budgets.
# NOTE(review): the comparison fits PCA on only the five images passed in, so
# at most four components can carry variance -- budgets above that are
# expected to give the same reconstruction. Confirm this is the intended demo.
component_counts = [10, 25, 50, 100, 200]
show_compression_comparison(
    original_images=card_images[:5],
    component_counts=component_counts,
    num_examples=5,
)

In [None]:
# Test compression pipeline: fit on a held-out slice, reconstruct five cards
N_COMPONENTS = 50  # single source of truth for the pipeline and the report
test_samples = card_images[:5]
training_samples = card_images[100:1000]

compressed_data, reconstructed_images = show_compression_pipeline(
    test_samples, training_samples, n_components=N_COMPONENTS)

# Calculate metrics
original_array = np.array(test_samples)
reconstructed_array = np.array(reconstructed_images)
mse = np.mean((original_array.astype(float) - reconstructed_array.astype(float)) ** 2)
# PSNR for 8-bit images (peak value 255); infinite for a perfect reconstruction
psnr = 20 * np.log10(255.0 / np.sqrt(mse)) if mse > 0 else float('inf')

# Derive the ratio from the data actually used rather than repeating the
# literals 64*64*3 and 50, which would silently go stale.
values_per_image = original_array[0].size

print("Compression metrics:")
print(f"  - MSE: {mse:.2f}")
print(f"  - PSNR: {psnr:.2f} dB")
print(f"  - Compression ratio: {values_per_image / N_COMPONENTS:.1f}x")

In [None]:
# PCA Visualization
def create_pca_scatter(images, labels, n_components=2, sample_size=1000):
    """Project images onto their principal components and scatter-plot the first two.

    Only the first ``sample_size`` images/labels are used. The data is centred
    and scaled by one global standard deviation; the principal axes come from
    a thin SVD of the normalised data, which avoids building the
    (n_features x n_features) covariance matrix.

    Args:
        images: Sequence/array of equally shaped images.
        labels: Class label per image (array or list).
        n_components: Number of components to compute (at least 2). The plot
            always uses the first two; the title reports the variance share of
            all ``n_components``.
        sample_size: Maximum number of leading images to use.

    Returns:
        Array of shape ``(n_used, n_components)`` with the projected samples.

    Raises:
        ValueError: If ``n_components`` is smaller than 2.
    """
    if n_components < 2:
        raise ValueError("create_pca_scatter needs n_components >= 2 to draw a 2-D scatter")

    actual_size = min(sample_size, len(images))
    X_sample = np.array([np.asarray(img).ravel() for img in images[:actual_size]], dtype=float)
    # asarray so that list labels also support the boolean masks used below.
    y_sample = np.asarray(labels)[:actual_size]

    # Apply PCA
    mean = X_sample.mean(axis=0)
    centered = X_sample - mean
    std = centered.std()
    if std == 0:
        std = 1.0  # constant data: avoid 0/0 = NaN
    normalized = centered / std

    # Rows of `axes` are principal axes sorted by decreasing variance;
    # s**2 / (n - 1) are the corresponding covariance eigenvalues.
    _, singular_values, axes = np.linalg.svd(normalized, full_matrices=False)
    eigenvalues = singular_values ** 2 / max(X_sample.shape[0] - 1, 1)

    components = axes[:n_components]
    # The thin SVD yields at most min(n_samples, n_features) axes; pad with
    # zero rows so the result always has n_components columns.
    missing = n_components - components.shape[0]
    if missing > 0:
        components = np.vstack([components, np.zeros((missing, components.shape[1]))])
    X_reduced = normalized @ components.T

    # Plot
    plt.figure(figsize=(10, 8))
    unique_classes = np.unique(y_sample)
    colors = plt.cm.tab10(np.linspace(0, 1, len(unique_classes)))

    for i, class_label in enumerate(unique_classes):
        class_mask = y_sample == class_label
        plt.scatter(X_reduced[class_mask, 0], X_reduced[class_mask, 1],
                   c=[colors[i]], label=f'Class {class_label}', alpha=0.7)

    total_variance = eigenvalues.sum()
    if total_variance > 0:
        variance_ratio = eigenvalues[:n_components].sum() / total_variance
    else:
        variance_ratio = 0.0
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.title(f'PCA Visualization - {variance_ratio:.1%} variance explained')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    return X_reduced

# Scatter the first (up to) 2000 cards in the plane of the top two components
X_2d = create_pca_scatter(
    images=card_images,
    labels=card_labels,
    sample_size=2000,
)

In [None]:
# Additional PCA analysis with specific classes
def analyze_specific_classes(images, labels, selected_classes=None, max_per_class=50):
    """Run a 2-component PCA on a per-class random sample and scatter-plot it.

    Up to ``max_per_class`` images are drawn without replacement (global NumPy
    RNG) from each selected class. The sample is centred, scaled by one global
    standard deviation and projected onto its first two principal axes, which
    come from a thin SVD of the normalised data.

    Args:
        images: Array (or list) of equally shaped images.
        labels: Class label per image (array or list).
        selected_classes: Labels to include; defaults to the first 10 values
            of ``np.unique(labels)``.
        max_per_class: Maximum number of images sampled per class.

    Returns:
        Array of shape ``(n_sampled, 2)`` with the projected samples, ordered
        class by class in the order of ``selected_classes``.

    Raises:
        ValueError: If none of the selected classes occurs in ``labels``.
    """
    # asarray so that list inputs support the boolean/fancy indexing below.
    images = np.asarray(images)
    labels = np.asarray(labels)

    if selected_classes is None:
        selected_classes = np.unique(labels)[:10]  # First 10 classes

    keep = np.isin(labels, selected_classes)
    filtered_images = images[keep]
    filtered_labels = labels[keep]

    # Sample up to max_per_class per class
    sample_indices = []
    for class_label in selected_classes:
        class_indices = np.where(filtered_labels == class_label)[0]
        if len(class_indices) > 0:
            n_pick = min(max_per_class, len(class_indices))
            sample_indices.extend(np.random.choice(class_indices, n_pick, replace=False))

    if not sample_indices:
        raise ValueError("none of the selected classes occur in labels")

    X_sample = np.array([filtered_images[i].ravel() for i in sample_indices], dtype=float)
    y_sample = filtered_labels[sample_indices]

    # Apply PCA
    mean = X_sample.mean(axis=0)
    centered = X_sample - mean
    std = centered.std()
    if std == 0:
        std = 1.0  # constant data: avoid 0/0 = NaN
    normalized = centered / std

    # Rows of `axes` are principal axes sorted by decreasing variance;
    # s**2 / (n - 1) are the corresponding covariance eigenvalues.
    _, singular_values, axes = np.linalg.svd(normalized, full_matrices=False)
    eigenvalues = singular_values ** 2 / max(X_sample.shape[0] - 1, 1)

    components = axes[:2]
    # A single sample yields only one axis; pad so the result keeps 2 columns.
    missing = 2 - components.shape[0]
    if missing > 0:
        components = np.vstack([components, np.zeros((missing, components.shape[1]))])
    X_2d = normalized @ components.T

    # Plot
    plt.figure(figsize=(12, 8))
    colors = plt.cm.tab10(np.linspace(0, 1, len(selected_classes)))

    for i, class_label in enumerate(selected_classes):
        class_mask = y_sample == class_label
        if np.any(class_mask):
            plt.scatter(X_2d[class_mask, 0], X_2d[class_mask, 1],
                       c=[colors[i]], label=f'Class {class_label}', alpha=0.7)

    total_variance = eigenvalues.sum()
    if total_variance > 0:
        variance_ratio = eigenvalues[:2].sum() / total_variance
    else:
        variance_ratio = 0.0
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.title(f'Class-Specific PCA Analysis - {variance_ratio:.1%} variance explained')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    return X_2d

# Class-specific PCA on the default selection (first 10 classes)
X_2d_classes = analyze_specific_classes(
    images=card_images,
    labels=card_labels,
)

In [None]:
# End of notebook