# FEATURE EXTRACTION ANALYSIS


The goal of feature extraction is to transform raw data into a reduced set of informative characteristics, or "features," that capture essential information from the data, often in a lower-dimensional space.

Given a set of input data points $X = \{x_0, x_1, \dots, x_n\}$, where each $x_i \in \mathbb{R}^D$, the objective is to define a function $f(x, \theta)$ that maps each input $x$ to a new representation $z = f(x, \theta)$, with $z \in \mathbb{R}^d$. Typically, we aim for $d \ll D$, allowing for a more compact and efficient representation of the data. Here, $\theta$ represents the parameters of the model, optimized to capture the essential features in the reduced dimension.

## Example 1:  MNIST Digits

### Loading the MNIST dataset

In [1]:
import gzip
import os
import numpy as np
import struct


def load_mnist(path, kind='train'):
    """Load one split of the gzip-compressed MNIST IDX files stored in `path`.

    Parameters:
        path (str): Directory containing `{kind}-labels-idx1-ubyte.gz` and
            `{kind}-images-idx3-ubyte.gz`.
        kind (str): File-name prefix selecting the split, e.g. 'train' or 't10k'.

    Returns:
        tuple: `(images, labels)`. `images` is a uint8 array of shape
        `(num, rows * cols)` (one flattened image per row) and `labels` is a
        1-D uint8 array.

    Raises:
        ValueError: If a file's magic number does not match the expected IDX
            type, or the label payload length disagrees with its header.
    """
    # IDX magic numbers: 0x0801 = unsigned-byte data with 1 dimension (labels),
    # 0x0803 = unsigned-byte data with 3 dimensions (images).
    labels_magic = 2049
    images_magic = 2051

    labels_path = os.path.join(path, f'{kind}-labels-idx1-ubyte.gz')
    images_path = os.path.join(path, f'{kind}-images-idx3-ubyte.gz')

    with gzip.open(labels_path, 'rb') as lbpath:
        # Header: big-endian magic number followed by the item count.
        magic, count = struct.unpack('>II', lbpath.read(8))
        if magic != labels_magic:
            raise ValueError(
                f"{labels_path}: unexpected magic number {magic}, "
                f"expected {labels_magic}"
            )
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8)

    if labels.size != count:
        raise ValueError(
            f"{labels_path}: header declares {count} labels "
            f"but {labels.size} were read"
        )

    with gzip.open(images_path, 'rb') as imgpath:
        # Header: big-endian magic number, image count, rows, columns.
        magic, num, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        if magic != images_magic:
            raise ValueError(
                f"{images_path}: unexpected magic number {magic}, "
                f"expected {images_magic}"
            )
        # reshape raises ValueError itself if the payload size is inconsistent
        # with the header dimensions.
        images = np.frombuffer(imgpath.read(), dtype=np.uint8).reshape(num, rows * cols)

    return images, labels

In [2]:
# Load the dataset
# Both splits are read from the same directory; `kind` selects the file-name
# prefix ('train' or 't10k') that load_mnist looks for.
mnist_path = '../supplemental_material/MNIST' 
X_train, y_train = load_mnist(mnist_path, kind='train')
X_test, y_test = load_mnist(mnist_path, kind='t10k')

# Print the array shapes as a quick sanity check on what was loaded.
print(f"Training set shape: {X_train.shape}, Training labels shape: {y_train.shape}")
print(f"Test set shape: {X_test.shape}, Test labels shape: {y_test.shape}")

Training set shape: (60000, 784), Training labels shape: (60000,)
Test set shape: (10000, 784), Test labels shape: (10000,)


### MLP as feature extractor

In [3]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    """Two-layer perceptron that can expose its pre-activation output as features.

    Parameters:
        input_size (int): Width of each input vector.
        hidden_size (int): Width of the hidden layer.
        output_size (int): Width of the second linear layer's output.
    """

    def __init__(self, input_size=784, hidden_size=128, output_size=64):
        super().__init__()
        # Sub-modules are registered in the order forward() applies them.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, return_features=False):
        """Run the network on `x`.

        Returns the raw output of the second linear layer when
        `return_features` is true; otherwise that output passed through
        the sigmoid.
        """
        hidden = self.relu(self.fc1(x))
        features = self.fc2(hidden)
        # The features are the values just before the final activation.
        return features if return_features else self.sigmoid(features)

In [4]:
# Convert the data to PyTorch tensors and normalize it
# The arrays hold uint8 pixel values, so dividing by 255.0 rescales them
# into the [0, 1] range as float32.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32) / 255.0  
X_test_tensor = torch.tensor(X_test, dtype=torch.float32) / 255.0

In [5]:
# Build the MLP with its default sizes. No weights are loaded at this point,
# so the features below come from the freshly initialized parameters.
model = MLP()
# Gradients are not needed for feature extraction, so autograd is disabled.
with torch.no_grad():
    train_features = model(X_train_tensor, return_features=True)

print(f"Extracted feature shape for training set: {train_features.shape}")

Extracted feature shape for training set: torch.Size([60000, 64])


In [6]:
from sklearn.manifold import TSNE

# Reduce the dimensionality of the features to 2D
# random_state is fixed so repeated runs start from the same seed.
tsne = TSNE(n_components=2, random_state=42)
# NOTE(review): this embeds every row of train_features at once; presumably
# slow on the full training set — confirm the runtime is acceptable.
reduced_features = tsne.fit_transform(train_features)

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt


def plot_features(features, labels):
    """
    Draws a 2D scatter plot of already-reduced features, colored by label.

    No dimensionality reduction is performed here: the first two columns of
    `features` are plotted directly, so the caller is expected to pass the
    2D output of t-SNE (the title and axis labels say "t-SNE").

    Parameters:
        features (np.array): 2D points; column 0 is plotted on x, column 1 on y.
        labels (np.array): Labels for the data points to color the plot.
    """
    marker_style = dict(
        palette="tab10",
        s=30,
        alpha=0.7,
        edgecolor='k',
        legend="full",
    )

    plt.figure(figsize=(10, 8))
    first_component = features[:, 0]
    second_component = features[:, 1]
    sns.scatterplot(x=first_component, y=second_component, hue=labels, **marker_style)

    # Titles and axis labels.
    plt.title("2D Visualization of Features using t-SNE")
    plt.xlabel("t-SNE Component 1")
    plt.ylabel("t-SNE Component 2")
    plt.legend(title="Label", loc="best")
    plt.show()

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Plot the 2D t-SNE embedding computed above, colored by the training labels.
plot_features(reduced_features, y_train)

In [None]:
# Load saved weights into the MLP, then repeat the feature extraction,
# t-SNE reduction and plot with the loaded parameters.
weights = os.path.join("data", "weights", "MLP, MNIST.pth")
# map_location="cpu" lets a checkpoint that was saved on a GPU load on a
# CPU-only machine; the input tensors here are on the CPU as well.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load(weights, map_location="cpu"))
model.eval()
with torch.no_grad():
    train_features = model(X_train_tensor, return_features=True)

# Same t-SNE settings (fixed seed) as for the freshly initialized model.
tsne = TSNE(n_components=2, random_state=42)
reduced_features = tsne.fit_transform(train_features)
plot_features(reduced_features, y_train)

### LeNet as feature extractor

In [None]:
class LeNet5(nn.Module):
    """LeNet-5 style CNN that can return its penultimate activations as features.

    The layer sizes fit single-channel 28x28 inputs: conv1 (padding 2) keeps
    28x28, pooling gives 14x14, conv2 gives 10x10, pooling gives 5x5, which
    matches the 16 * 5 * 5 flatten size.

    Parameters:
        num_classes (int): Width of the final linear layer. Defaults to 2,
            the value that was previously hard-coded (the original comment
            described it as "circle and no-circle"). Pass another value,
            e.g. 10, to match a checkpoint trained with a different number
            of classes.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x, return_features=False):
        """Run the network on `x`.

        Parameters:
            x (torch.Tensor): Batch of single-channel images.
            return_features (bool): When true, return the 84-wide activations
                of fc2 instead of the fc3 output.

        Returns:
            torch.Tensor: Features of width 84, or fc3 outputs of width
            `num_classes`.
        """
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        # Flatten the 16 feature maps of 5x5 into one vector per sample.
        x = x.view(-1, 16 * 5 * 5)
        x = torch.relu(self.fc1(x))
        features = torch.relu(self.fc2(x))
        if return_features:
            return features  # Return features before the final layer
        return self.fc3(features)

In [None]:
# Build LeNet-5, load saved weights into it and extract features.
model = LeNet5()
weights = os.path.join("data", "weights", "Lenet5, MNIST.pth")
# map_location="cpu" lets a checkpoint that was saved on a GPU load on a
# CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
# NOTE(review): LeNet5 as defined in this file ends in a 2-unit layer;
# load_state_dict raises if the checkpoint's fc3 has a different shape —
# confirm the checkpoint matches.
model.load_state_dict(torch.load(weights, map_location="cpu"))
model.eval()
# The convolutional model needs image-shaped input, so the flat 784-pixel rows
# are reshaped to (N, 1, 28, 28). This rebinds X_train_tensor for later cells.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).view(-1, 1, 28, 28) / 255.0
with torch.no_grad():
    train_features = model(X_train_tensor, return_features=True)

print(f"Extracted feature shape for training set: {train_features.shape}")

In [None]:
# Embed the LeNet-5 features in 2D with t-SNE (fixed seed) and plot them
# colored by the training labels.
tsne = TSNE(n_components=2, random_state=42)
reduced_features = tsne.fit_transform(train_features)
plot_features(reduced_features, y_train)