# Create Compressed & Noisy EMNIST Letter Training Data
Creates compressed ***EMNIST*** images with imposed Poisson noise.

**Author:** Fabian Santiago  
**Update:** September 19, 2024

***NOTE:*** *Code creates compressed noisy images of dimensions:*

<ul>
  <li><b><em>4x4</em></b>: Compressed and imposed Poisson noise<br></li>
  <li><b><em>7x7</em></b>: Compressed and imposed Poisson noise<br></li>
  <li><b><em>14x14</em></b>: Compressed and imposed Poisson noise<br></li>
  <li><b><em>28x28</em></b>: Only imposed Poisson noise</li>
</ul>

Jupyter Notebook Version: 6.5.4  
Python Version: 3.11.5  
TensorFlow Version: 2.16.2

## Import Modules and Libraries

In [None]:
# Import necessary modules and libraries 
import os
import h5py
import numpy as np
from skimage.util.shape import view_as_windows
from emnist import extract_training_samples
from emnist import extract_test_samples

## Set Values: Seed, Compression, and Output Dimension

In [None]:
# Dimension of output: the original EMNIST images are 28 x 28.
# NOTE: this value is passed to create_training_data, which currently does not use it.
out_dim  = 28 

## Helper Function Definitions


In [None]:
# Define sliding window compression
def median_downsampling(in_image, cmp_dim):
    """Compress a 2-D image by taking the median of non-overlapping windows.

    The square window side is ``in_image.shape[0] // cmp_dim``, so a
    28 x 28 image becomes 4 x 4 (7 x 7 windows), 7 x 7 (4 x 4 windows) or
    14 x 14 (2 x 2 windows). Pixels that do not fill a complete window at
    the right/bottom edge are dropped.

    Parameters
    ----------
    in_image : array_like
        2-D image.
    cmp_dim : int
        Requested side length of the compressed image; must be positive.

    Returns
    -------
    numpy.ndarray
        Array of window medians. When the window side would be smaller
        than two pixels (e.g. ``cmp_dim`` equal to the image side) no
        compression is possible and ``in_image`` itself is returned,
        not a copy.

    Raises
    ------
    ValueError
        If ``cmp_dim`` is not positive.
    """
    if cmp_dim < 1:
        raise ValueError(f"cmp_dim must be a positive integer, got {cmp_dim}")

    image = np.asarray(in_image)

    # Derive the window from the actual image size rather than assuming 28
    window_size = image.shape[0] // cmp_dim

    # A window of fewer than two pixels per side cannot compress anything
    if window_size < 2:
        return in_image

    # Number of complete windows that fit along each axis
    n_rows = image.shape[0] // window_size
    n_cols = image.shape[1] // window_size

    # Split the (cropped) image into non-overlapping window_size x window_size
    # tiles: axes 1 and 3 index the pixels inside each tile
    tiles = image[:n_rows * window_size, :n_cols * window_size].reshape(
        n_rows, window_size, n_cols, window_size
    )

    # Calculate the median over each tile
    return np.median(tiles, axis=(1, 3))

# Compress every image in a collection
def down_sample_list(in_array,cmp_dim):
    """Apply ``median_downsampling`` to each image of ``in_array``.

    Parameters
    ----------
    in_array : sequence of 2-D arrays
        Images to compress; must support ``len`` and iteration.
    cmp_dim : int
        Side length of each compressed image.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(in_array), cmp_dim, cmp_dim)`` with NumPy's
        default float dtype, holding one compressed image per input image.
    """
    # Pre-allocate the result; every slot is overwritten below
    compressed = np.empty((len(in_array), cmp_dim, cmp_dim))

    # Walk output slots and input images in lockstep, filling each slot in place
    for slot, image in zip(compressed, in_array):
        slot[...] = median_downsampling(image, cmp_dim)

    return compressed

# Compress one image set, impose Poisson noise, and flatten (private helper)
def _compress_add_noise(clean_images, cmp_dim):
    """Return flattened, noisy, compressed versions of ``clean_images``."""
    # Compress images using median in sliding window
    compressed = down_sample_list(clean_images, cmp_dim)

    # Draw Poisson noise with the compressed pixel values as the rates,
    # then clip (not rescale) the draws to the valid range [0,255]
    noisy = np.clip(np.random.poisson(lam=compressed), 0, 255)

    # Scale to [0,1] and flatten each image with one vectorized reshape
    return (noisy / 255).reshape(len(noisy), cmp_dim**2)


# Create compressed noisy data
def create_training_data(clean_train, clean_test, cmp_dim, out_dim):
    """Create compressed, Poisson-noised, flattened training and test sets.

    Each image is median-compressed to ``cmp_dim`` x ``cmp_dim``, used as
    the rate of a Poisson draw, clipped to [0,255], divided by 255 and
    flattened.

    Parameters
    ----------
    clean_train, clean_test : sequence of 2-D arrays
        Clean images for the training and test sets.
    cmp_dim : int
        Side length of the compressed images.
    out_dim : int
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tuple of numpy.ndarray
        ``(noisy_train, noisy_test)``, each of shape
        ``(number of images, cmp_dim**2)`` with values in [0,1].

    Notes
    -----
    Noise is drawn from NumPy's global random state (training set first,
    then test set); seed it with ``np.random.seed`` beforehand for
    reproducible output.
    """
    noisy_train = _compress_add_noise(clean_train, cmp_dim)
    noisy_test = _compress_add_noise(clean_test, cmp_dim)

    # Return training data
    return noisy_train, noisy_test

## Create Compressed & Noisy Training Data
Load EMNIST data and create training data

In [None]:
directory = 'training_data'

# Create training data directory if nonexistent; exist_ok avoids the
# race between checking for the directory and creating it
os.makedirs(directory, exist_ok=True)

# Load EMNIST (og -> original); the second return value is discarded
clean_train_og, _ = extract_training_samples('letters')
clean_test_og, _  = extract_test_samples('letters')

# One output file per compressed dimension (28 -> noise only, no compression)
for cmp_dim in [4,7,14,28]:
    # Set the training data file name
    dat_file = f'{directory}/emnist_{cmp_dim}x{cmp_dim}_train.h5'

    # Create compressed/noisy training/test data
    noisy_train, noisy_test = create_training_data(clean_train_og, clean_test_og, cmp_dim, out_dim)

    # Save compressed/noisy training/test data ('w' overwrites an existing file)
    with h5py.File(dat_file, 'w') as f:
        # Save compressed noisy images
        f.create_dataset('noisy_train', data=noisy_train, compression='gzip')
        f.create_dataset('noisy_test', data=noisy_test, compression='gzip')