# Pokémon Sprites and Renders Analysis

This notebook explores the visual differences between Pokémon sprites and renders across different generations. We'll use feature extraction techniques to analyze how the visual representation of Pokémon has evolved over time.

## Introduction

Pokémon sprites have evolved significantly from Generation 1 (Red/Blue) to modern games. This analysis will:

1. Compare sprites from different generations using color histograms
2. Measure similarity between official artwork (Sugimori art) and game sprites
3. Quantify the visual evolution of Pokémon designs

In [None]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

## Data Exploration

Let's explore the sprites and renders available in our dataset.

In [None]:
# Define paths to the data directories.
# data_dir is a relative path, so it is resolved against the current working directory.
data_dir = '../../data'
# Generation 1 sprites (the red-blue/gray folder of the sprite set)
gen1_sprites_dir = os.path.join(data_dir, 'sprites/gen_1/main-sprites/red-blue/gray')
# Generation 5 sprites (the black-white folder of the sprite set)
gen5_sprites_dir = os.path.join(data_dir, 'sprites/gen_5/main-sprites/black-white')
# Sugimori artwork renders
sugimori_dir = os.path.join(data_dir, 'renders/sugimori')

def count_files(directory):
    """Return the number of regular files directly inside *directory*.

    Subdirectories are neither counted nor descended into. A path that does
    not exist, or that exists but is not a directory, yields 0 instead of
    raising.
    """
    # isdir rather than exists: a path that points at a plain file would pass
    # an exists() check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(directory):
        return 0
    return sum(
        os.path.isfile(os.path.join(directory, name))
        for name in os.listdir(directory)
    )

# Tally the files found in each of the three image sources
gen1_count, gen5_count, sugimori_count = (
    count_files(folder)
    for folder in (gen1_sprites_dir, gen5_sprites_dir, sugimori_dir)
)

# Report the totals
print(f"Generation 1 sprites: {gen1_count}")
print(f"Generation 5 sprites: {gen5_count}")
print(f"Sugimori artwork renders: {sugimori_count}")

## Feature Extraction: Color Histograms

We'll use color histograms to analyze the visual differences between generations.

In [None]:
def extract_color_histogram(image_path, bins=32):
    """Return a normalized, concatenated R/G/B histogram for an image file.

    Args:
        image_path: Path of the image to read.
        bins: Number of equal-width bins per channel over the range [0, 256).

    Returns:
        A float array of length ``3 * bins`` (red bins, then green, then
        blue), scaled so that all entries together sum to approximately 1.
        ``None`` if the image could not be read or processed; the error is
        printed rather than raised.
    """
    try:
        # Use a context manager so the file handle opened by Image.open is
        # released as soon as the pixels have been copied into the array,
        # instead of lingering until the image object is garbage collected.
        with Image.open(image_path) as img:
            # Non-RGB modes are converted so that three channels are always present
            rgb = img if img.mode == 'RGB' else img.convert('RGB')
            pixels = np.array(rgb)

        # One histogram per channel, in R, G, B order
        channel_hists = [
            np.histogram(pixels[:, :, channel].ravel(), bins=bins, range=(0, 256))[0]
            for channel in range(3)
        ]
        hist_features = np.concatenate(channel_hists).astype('float')

        # Normalize; the small constant avoids division by zero
        hist_features /= (hist_features.sum() + 1e-7)
        return hist_features
    except Exception as e:
        # Deliberate best-effort boundary: a failure is reported and signalled
        # with None so that one bad image does not abort a whole batch.
        print(f"Error processing {image_path}: {e}")
        return None

In [None]:
# Popular Pokémon to analyze, built from (id, name) pairs
popular_pokemon = [
    dict(id=number, name=label)
    for number, label in (
        (25, 'Pikachu'),
        (6, 'Charizard'),
        (150, 'Mewtwo'),
        (94, 'Gengar'),
        (143, 'Snorlax'),
    )
]

def get_pokemon_paths(pokemon_id):
    """Build the expected image path for *pokemon_id* in each image source.

    Returns a dict keyed 'gen1', 'gen5' and 'sugimori' whose values are
    ``<source directory>/<pokemon_id>.png``. The paths are only constructed
    here; nothing is checked on disk.
    """
    filename = f"{pokemon_id}.png"
    source_dirs = (
        ('gen1', gen1_sprites_dir),
        ('gen5', gen5_sprites_dir),
        ('sugimori', sugimori_dir),
    )
    return {label: os.path.join(folder, filename) for label, folder in source_dirs}

# Build the working dataset: keep only Pokémon whose images exist on disk
# in at least two sources, since a comparison needs a pair.
pokemon_data = []
for pokemon in popular_pokemon:
    # Keep only the sources whose image file is actually present
    available_images = {
        gen: path
        for gen, path in get_pokemon_paths(pokemon['id']).items()
        if os.path.exists(path)
    }

    if len(available_images) < 2:
        continue

    pokemon_data.append({
        'id': pokemon['id'],
        'name': pokemon['name'],
        'paths': available_images
    })

print(f"Found {len(pokemon_data)} Pokémon with images across multiple generations")

In [None]:
def display_pokemon_with_histograms(pokemon_list):
    """Show each Pokémon's images above their per-channel color histograms.

    One figure is drawn per entry, with one column per available image: the
    top row shows the image and the bottom row shows its red, green and blue
    histograms as grouped bars. Columns are ordered by source label.

    Args:
        pokemon_list: Iterable of dicts with 'name', 'id' and 'paths' keys,
            where 'paths' maps a source label to an image file path.
    """
    for pokemon in pokemon_list:
        print(f"\n{pokemon['name']} (#{pokemon['id']}):\n")

        paths = pokemon['paths']
        if not paths:
            # A subplot grid needs at least one column, so there is nothing to draw
            print("No images available")
            continue

        # squeeze=False always returns a 2-D (rows, columns) array of axes,
        # so the single-image case needs no special reshaping.
        _, axes = plt.subplots(2, len(paths), figsize=(15, 6), squeeze=False)

        # Sort to ensure a consistent column order
        for i, (gen, path) in enumerate(sorted(paths.items())):
            image_ax, hist_ax = axes[0, i], axes[1, i]
            try:
                # Copy the pixels out inside the context manager so the file
                # handle is closed before plotting.
                with Image.open(path) as img:
                    rgb = img if img.mode == 'RGB' else img.convert('RGB')
                    pixels = np.array(rgb)

                image_ax.imshow(pixels)
                image_ax.set_title(gen.capitalize())
                image_ax.axis('off')

                # Use fewer bins than the default for a clearer visualization
                hist_features = extract_color_histogram(path, bins=16)
                if hist_features is None:
                    continue

                # Split the concatenated vector back into R, G, B components
                hist_len = len(hist_features) // 3
                r_hist = hist_features[:hist_len]
                g_hist = hist_features[hist_len:2 * hist_len]
                b_hist = hist_features[2 * hist_len:]

                # Three bars side by side for each bin
                x = np.arange(hist_len)
                width = 0.25
                hist_ax.bar(x - width, r_hist, width, color='r', alpha=0.7, label='Red')
                hist_ax.bar(x, g_hist, width, color='g', alpha=0.7, label='Green')
                hist_ax.bar(x + width, b_hist, width, color='b', alpha=0.7, label='Blue')
                hist_ax.set_title("Color Histogram")
                hist_ax.set_xticks([])

                # Add a legend to the first histogram only
                if i == 0:
                    hist_ax.legend(loc='upper right')
            except Exception as e:
                # Best effort: report the failure and move on to the next image
                print(f"Error processing {path}: {e}")

        plt.tight_layout()
        plt.show()

# Display images and their color histograms for every entry in pokemon_data
display_pokemon_with_histograms(pokemon_data)

## Comparing Sprites Across Generations

Now let's quantify the differences between generations using histogram similarity metrics.

In [None]:
def calculate_histogram_similarity(hist1, hist2, method='correlation'):
    """Calculate the similarity between two histograms.

    Args:
        hist1: First histogram as an array-like of numbers, or None.
        hist2: Second histogram, with the same shape as *hist1*, or None.
        method: One of
            'correlation'  - correlation coefficient (higher means more similar),
            'chi_square'   - chi-square distance (lower means more similar),
            'intersection' - sum of element-wise minima (higher means more similar).

    Returns:
        The requested metric as a float, or None if either histogram is None.

    Raises:
        ValueError: If *method* is not recognized or the shapes differ.
    """
    if hist1 is None or hist2 is None:
        return None

    if method not in ('correlation', 'chi_square', 'intersection'):
        raise ValueError(f"Unknown similarity method: {method}")

    # Coerce to float arrays so that plain lists and integer counts work for
    # every metric (list - list is not defined; unsigned ints would wrap).
    first = np.asarray(hist1, dtype=float)
    second = np.asarray(hist2, dtype=float)

    # Reject mismatched shapes explicitly; otherwise NumPy could silently
    # broadcast them (e.g. length 1 against length n) into a meaningless score.
    if first.shape != second.shape:
        raise ValueError(
            f"Histogram shapes differ: {first.shape} vs {second.shape}"
        )

    if method == 'correlation':
        return np.corrcoef(first, second)[0, 1]

    if method == 'chi_square':
        eps = 1e-10  # small value to avoid division by zero
        return np.sum(((first - second) ** 2) / (first + second + eps)) / 2.0

    # method == 'intersection'
    return np.sum(np.minimum(first, second))

def compare_generations(pokemon_list):
    """Compare the color histograms of each Pokémon's images pairwise.

    For every entry, a histogram is extracted per available image and each
    unordered pair of sources is scored with the correlation, chi-square and
    intersection metrics. A per-Pokémon table and a short summary are shown
    along the way.

    Args:
        pokemon_list: Iterable of dicts with 'name', 'id' and 'paths' keys,
            where 'paths' maps a source label to an image file path.

    Returns:
        A DataFrame with one row per compared pair and the columns
        'Pokemon', 'ID', 'Comparison', 'Correlation', 'Chi-Square' and
        'Intersection'. The columns are present even when there are no rows.
    """
    columns = ['Pokemon', 'ID', 'Comparison', 'Correlation', 'Chi-Square', 'Intersection']

    # `display` is supplied by IPython/Jupyter; fall back to print so the
    # function also works when run as a plain script.
    try:
        show = display
    except NameError:
        show = print

    results = []

    for pokemon in pokemon_list:
        print(f"\n{pokemon['name']} Comparisons:")

        # Get histograms for the images that could be processed
        histograms = {}
        for gen, path in pokemon['paths'].items():
            hist = extract_color_histogram(path)
            if hist is not None:
                histograms[gen] = hist

        # Walk the sorted labels so each unordered pair appears exactly once,
        # in an order that does not depend on dict insertion order.
        labels = sorted(histograms)
        pokemon_results = []
        for pos, gen1 in enumerate(labels):
            for gen2 in labels[pos + 1:]:
                hist1 = histograms[gen1]
                hist2 = histograms[gen2]
                pokemon_results.append({
                    'Pokemon': pokemon['name'],
                    'ID': pokemon['id'],
                    'Comparison': f"{gen1} vs {gen2}",
                    'Correlation': calculate_histogram_similarity(hist1, hist2, 'correlation'),
                    'Chi-Square': calculate_histogram_similarity(hist1, hist2, 'chi_square'),
                    'Intersection': calculate_histogram_similarity(hist1, hist2, 'intersection')
                })

        results.extend(pokemon_results)

        if not pokemon_results:
            print("Not enough images for comparison")
            continue

        results_df = pd.DataFrame(pokemon_results, columns=columns)
        show(results_df)

        # idxmax/idxmin return index labels, so rows are fetched with .loc.
        # NaN scores (e.g. correlation of a constant histogram) are dropped
        # first so they cannot break or be chosen by the summary.
        correlations = results_df['Correlation'].dropna()
        if not correlations.empty:
            best = results_df.loc[correlations.idxmax()]
            print(f"Most similar pair (by correlation): {best['Comparison']} with correlation {best['Correlation']:.4f}")

        distances = results_df['Chi-Square'].dropna()
        if not distances.empty:
            best = results_df.loc[distances.idxmin()]
            print(f"Most similar pair (by chi-square): {best['Comparison']} with chi-square {best['Chi-Square']:.4f}")

    return pd.DataFrame(results, columns=columns)

# Compare the Pokémon
all_results = compare_generations(pokemon_data)

# Overall analysis across all Pokémon
if not all_results.empty:
    print("\nOverall comparison across all Pokémon:")

    # Group by comparison type
    comparison_groups = all_results.groupby('Comparison')

    # Calculate average metrics per comparison type
    avg_metrics = comparison_groups.agg({
        'Correlation': 'mean',
        'Chi-Square': 'mean',
        'Intersection': 'mean'
    }).reset_index()

    display(avg_metrics)

    # Create a bar chart of average correlation by comparison type
    plt.figure(figsize=(10, 6))
    plt.bar(avg_metrics['Comparison'], avg_metrics['Correlation'])
    plt.title('Average Correlation Between Different Generations')
    plt.xlabel('Comparison')
    plt.ylabel('Average Correlation')
    # Correlation ranges over [-1, 1]. Keep the 0..1 view when every average
    # is non-negative, but widen it when one is negative so that bar is not
    # clipped out of the chart.
    lowest_avg = avg_metrics['Correlation'].min()
    plt.ylim(-1 if lowest_avg < 0 else 0, 1)
    plt.grid(axis='y', alpha=0.3)
    plt.show()

## Expanding the Analysis

Let's expand our analysis to include more Pokémon and gather more comprehensive results.

In [None]:
def find_pokemon_with_all_sprites(limit=20):
    """Find Pokémon that have a PNG in all three image sources.

    File names (without the '.png' extension) that appear in the Generation 1,
    Generation 5 and Sugimori directories are collected; names that are not
    purely numeric are ignored. Results are ordered by numeric ID.

    Args:
        limit: Maximum number of Pokémon to return (non-negative int), or
            None for no limit. Defaults to 20 for brevity.

    Returns:
        A list of dicts with 'id' (int), 'name' ("Pokemon #<id>", since real
        names are not available here) and 'paths' (source label -> file path).
        The list is empty if any source directory is missing.
    """
    def png_stems(directory):
        # A missing source directory simply contributes no images
        if not os.path.isdir(directory):
            return set()
        return {
            os.path.splitext(filename)[0]
            for filename in os.listdir(directory)
            if filename.endswith('.png')
        }

    source_dirs = {
        'gen1': gen1_sprites_dir,
        'gen5': gen5_sprites_dir,
        'sugimori': sugimori_dir,
    }

    # Names present in every source
    common_stems = set.intersection(*(png_stems(folder) for folder in source_dirs.values()))

    # Keep the original name strings and only *sort* numerically, so that a
    # zero-padded name such as "001" still maps back to the real "001.png"
    # instead of a non-existent "1.png". isdecimal() guarantees int() succeeds.
    numeric_stems = sorted(
        (stem for stem in common_stems if stem.isdecimal()),
        key=lambda stem: (int(stem), stem),
    )

    return [
        {
            'id': int(stem),
            'name': f"Pokemon #{int(stem)}",  # We don't have names, just IDs
            'paths': {
                label: os.path.join(folder, f"{stem}.png")
                for label, folder in source_dirs.items()
            },
        }
        for stem in numeric_stems[:limit]
    ]

# Collect the Pokémon that have an image in every source
expanded_pokemon_data = find_pokemon_with_all_sprites()
print(f"Found {len(expanded_pokemon_data)} Pokémon with sprites in all three sources")

# Run the pairwise comparison on this larger set
expanded_results = compare_generations(expanded_pokemon_data)

# Overlay one correlation histogram per comparison type
if not expanded_results.empty:
    plt.figure(figsize=(12, 6))

    # sort=False keeps the comparison types in order of first appearance
    for comparison, comparison_data in expanded_results.groupby('Comparison', sort=False):
        plt.hist(comparison_data['Correlation'], alpha=0.5, bins=10, label=comparison)

    plt.title('Distribution of Correlation Values by Comparison Type')
    plt.xlabel('Correlation')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.show()

## Conclusion

This analysis provides insights into how Pokémon visual representations have evolved across generations. Key findings include:

1. Similarity between different generations of sprites varies significantly by Pokémon
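2. Official artwork (Sugimori) tends to be more detailed than game sprites; note that this is a visual observation, since color histograms capture color distribution rather than level of detail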
3. Generation 5 sprites are generally more similar to official artwork than Generation 1 sprites

These findings could be useful for the main classification project, as they suggest that different feature extraction methods might be needed depending on which generation of artwork is being classified.