In [12]:
import numpy as np
import tensorflow.keras.backend as K
from tensorflow.keras import layers, models, datasets, callbacks

# Fetch Fashion MNIST, which arrives already divided into a training split and a test split.
# x_train, x_test: grayscale images (28x28 pixels, integer values 0-255).
# y_train, y_test: integer class labels (0-9).
(x_train, y_train), (x_test, y_test) = datasets.fashion_mnist.load_data()

# Pre-process the Fashion MNIST images so they are ready for the network.
# 1. Normalize pixel values: convert to float32 and scale from 0-255 to 0.0-1.0.
# 2. Pad images: add 2 pixels of zero-padding on every side of each 28x28 image,
#    making them 32x32 pixels. 32 halves cleanly through the encoder's three
#    stride-2 convolutions (32 -> 16 -> 8 -> 4).
# 3. Add channel dimension: for grayscale images, add a trailing channel axis of size 1
#    (e.g., from (32, 32) to (32, 32, 1)). Keras Conv2D layers expect this axis.
def preprocess_mnist_image(images):
    """Scale, zero-pad, and add a channel axis to a batch of images.

    Args:
        images: Rank-3 array of shape (N, H, W); pixel values are expected
            to lie in 0-255.

    Returns:
        float32 array of shape (N, H + 4, W + 4, 1). Pixel values are divided
        by 255, and the 2-pixel border added on each side is filled with zeros.
    """
    scaled = images.astype("float32") / 255.0
    # Pad only the two spatial axes; the leading batch axis is left untouched.
    border = ((0, 0), (2, 2), (2, 2))
    padded = np.pad(scaled, border, constant_values=0.0)
    # Append a trailing channel axis of size 1.
    return np.expand_dims(padded, axis=-1)

# Run both splits through the same preprocessing so they share scale and shape.
x_train, x_test = (preprocess_mnist_image(split) for split in (x_train, x_test))

## Encoders

Encoders can be thought of as feature extractors. They take raw input (in our case, Fashion MNIST images) and compress it
into a compact but informative representation in a latent space, also called an embedding space (the space of all possible encoder outputs). For example, an image of pants with pockets may be encoded as the point (5.5, -6.3) in a two-dimensional latent space. This isn't just about shrinking data; it's about making the data easier to use in downstream tasks by highlighting its key underlying features.

## Decoders

Decoders are the counterparts to encoders. Given an encoding (also called an embedding or latent representation), they expand it back into an output. Continuing the previous example, a decoder may take the coordinate (5.5, -6.3) and turn it back into an image of pants with pockets.

## Autoencoders

Autoencoders are made up of an encoder and a decoder. An autoencoder takes an image, encodes it into an embedding, and then decodes that embedding into an image similar to the input. In other words, it maps an image to a point in its embedding space (or latent space) and then generates a facsimile of the original from that point.

## Encoding: Mapping to a Latent Space

To do all that, we will first need to embed images into a latent space using an encoder.

In [13]:
encoder_input = layers.Input(shape=(32, 32, 1), name="encoder_input")

# The encoder progressively extracts features while shrinking the spatial size,
# mapping the input image to a lower-dimensional latent space. Every stride-2
# convolution with "same" padding halves the height and width.
# Output shape after this layer: (16, 16, 32)
x = layers.Conv2D(32, (3, 3), strides=2, activation="relu", padding="same")(encoder_input)
# Output shape after this layer: (8, 8, 64)
x = layers.Conv2D(64, (3, 3), strides=2, activation="relu", padding="same")(x)
# Output shape after this layer: (4, 4, 128)
x = layers.Conv2D(128, (3, 3), strides=2, activation="relu", padding="same")(x)

# Remember the feature-map shape without the batch axis; the decoder needs it
# to reshape its dense output back into feature maps. The shape is read
# directly from the tensor's static shape and stored as a plain tuple.
shape_before_flattening = tuple(x.shape[1:])

# Flatten the 3D output of the last convolutional layer (4x4x128) into a
# 1D vector (4 * 4 * 128 = 2048 elements) so it can feed a fully connected
# (Dense) layer. Flattening does discard the 2D grid structure, i.e. the spatial
# information about which features were next to each other. We assume that
# spatial relationship has already been captured by the preceding Conv2D layers.
x = layers.Flatten()(x)

# The fully connected output layer has 2 units, which is the dimensionality of
# the latent space: each input image is compressed into a 2-dimensional vector.
# No activation is applied, so latent coordinates are unbounded.
encoder_output = layers.Dense(2, name="encoder_output")(x)

encoder = models.Model(encoder_input, encoder_output)

## Decoder

The decoder does the opposite of the encoder. As such, instead of convolutional layers, it uses convolutional transpose layers. These use
many of the same principles as standard convolutional layers, but they upsample instead of downsample. In other words, given a
low-dimensional input such as an embedding (e.g. (3, 5) in a latent space), the transposed convolutions can reconstruct a higher-resolution output (a picture of clothing).



In [14]:
decoder_input = layers.Input(shape=(2,), name="decoder_input")

# Expand the 2-D latent point to as many units as the encoder's last feature
# map held, then fold that vector back into the feature-map shape.
# np.prod returns a NumPy integer; Dense documents `units` as a positive
# integer, so pass it a plain Python int.
x = layers.Dense(int(np.prod(shape_before_flattening)))(decoder_input)
x = layers.Reshape(shape_before_flattening)(x)

# Each stride-2 transposed convolution with "same" padding doubles the height
# and width, mirroring the encoder's three downsampling steps.
# Starting from the encoder's (4, 4, 128): output shape (8, 8, 128)
x = layers.Conv2DTranspose(128, (3, 3), strides=2, activation="relu", padding="same")(x)
# Output shape after this layer: (16, 16, 64)
x = layers.Conv2DTranspose(64, (3, 3), strides=2, activation="relu", padding="same")(x)
# Output shape after this layer: (32, 32, 32)
x = layers.Conv2DTranspose(32, (3, 3), strides=2, activation="relu", padding="same")(x)

# A final stride-1 convolution collapses the feature maps to a single channel.
# The sigmoid keeps every output pixel in 0-1, the same range as the
# normalized input images.
decoder_output = layers.Conv2D(1, (3, 3), strides=1, activation="sigmoid", padding="same", name="decoder_output")(x)

decoder = models.Model(decoder_input, decoder_output)

## AutoEncoder

In [16]:
# Chain the encoder and decoder into one end-to-end model:
# image -> 2-D latent point -> reconstructed image.
autoencoder = models.Model(
    encoder_input, decoder(encoder_output)
)
autoencoder.summary()
# Pixels are scaled to 0-1 and the decoder ends in a sigmoid, so per-pixel
# binary cross-entropy is used as the reconstruction loss.
autoencoder.compile(optimizer="adam", loss="binary_crossentropy")

# Training hyperparameters, named so they are easy to find and tune.
EPOCHS = 3
BATCH_SIZE = 100

# The images serve as both input and target: the model learns to reproduce
# what it is given. The History is kept in a variable so the loss curves can be
# inspected later and so its bare repr is not echoed as the cell's output.
# NOTE: the test split doubles as validation data here, so val_loss is not an
# independent held-out estimate.
history = autoencoder.fit(
    x_train,
    x_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    validation_data=(x_test, x_test),
)

Epoch 1/3


I0000 00:00:1752206123.286613    1767 service.cc:152] XLA service 0x7c2acc002550 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1752206123.286655    1767 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3090 Ti, Compute Capability 8.6
2025-07-10 23:55:23.456443: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1752206123.932738    1767 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 27/600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m17:02[0m 2s/step - loss: 0.6546

I0000 00:00:1752206130.105051    1767 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - loss: 0.3499 - val_loss: 0.2613
Epoch 2/3
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.2580 - val_loss: 0.2564
Epoch 3/3
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.2554 - val_loss: 0.2538


<keras.src.callbacks.history.History at 0x7c2c3af01ac0>