# FEATURE EXTRACTION ANALYSIS


The goal of feature extraction is to transform raw data into a reduced set of informative characteristics, or "features," that capture essential information from the data, often in a lower-dimensional space.

Given a set of input data points $X = \{x_0, x_1, \dots, x_n\}$, where each $x_i \in \mathbb{R}^D$, the objective is to define a function $f(x, \theta)$ that maps each input $x$ to a new representation $z = f(x, \theta)$, with $z \in \mathbb{R}^d$. Typically, we aim for $d \ll D$, allowing for a more compact and efficient representation of the data. Here, $\theta$ represents the parameters of the model, optimized to capture essential features in a reduced dimension.

## Example 1:  MNIST Digits

### Loading the MNIST dataset

In [1]:
import gzip
import os
import numpy as np
import struct


def load_mnist(path, kind='train'):
    """Read one split of the gzip-compressed MNIST IDX files stored in `path`.

    Parameters:
        path (str): Directory containing `<kind>-labels-idx1-ubyte.gz` and
            `<kind>-images-idx3-ubyte.gz`.
        kind (str): File-name prefix selecting the split (e.g. 'train', 't10k').

    Returns:
        tuple: `(images, labels)`; `images` is a uint8 array of shape
        (num, rows*cols) and `labels` is a flat uint8 array.
    """
    def _archive(stem):
        # Full path of one of the two archives belonging to this split
        return os.path.join(path, f'{kind}-{stem}-ubyte.gz')

    with gzip.open(_archive('labels-idx1'), 'rb') as label_file:
        # The label header is two big-endian uint32 values; they are parsed but not used
        struct.unpack('>II', label_file.read(8))
        labels = np.frombuffer(label_file.read(), dtype=np.uint8)

    with gzip.open(_archive('images-idx3'), 'rb') as image_file:
        # The image header is four big-endian uint32 values; the first one is ignored
        header = struct.unpack('>IIII', image_file.read(16))
        num, rows, cols = header[1:]
        pixels = np.frombuffer(image_file.read(), dtype=np.uint8)
        images = pixels.reshape(num, rows * cols)  # one flattened image per row

    return images, labels

In [None]:
# Read the training and test splits from the local copy of MNIST
mnist_path = '../supplemental_material/MNIST'
(X_train, y_train), (X_test, y_test) = (
    load_mnist(mnist_path, kind=split) for split in ('train', 't10k')
)

# Report the array shapes of both splits
print(f"Training set shape: {X_train.shape}, Training labels shape: {y_train.shape}")
print(f"Test set shape: {X_test.shape}, Test labels shape: {y_test.shape}")

### MLP as feature extractor

In [3]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    """Two-layer perceptron whose pre-sigmoid output can be read out as features.

    Parameters:
        input_size (int): Number of values per input sample.
        hidden_size (int): Width of the hidden layer.
        output_size (int): Width of the second layer, i.e. the feature dimension.
    """

    def __init__(self, input_size=784, hidden_size=128, output_size=64):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, return_features=False):
        """Run the network on `x`.

        Parameters:
            x (torch.Tensor): Input whose last dimension is `input_size`.
            return_features (bool): When True, return the output of `fc2`
                without applying the sigmoid.

        Returns:
            torch.Tensor: The raw `fc2` output if `return_features` is True,
            otherwise its sigmoid.
        """
        features = self.fc2(self.relu(self.fc1(x)))
        return features if return_features else self.sigmoid(features)

In [4]:
# Build float32 tensors from the pixel arrays and divide by 255 to normalize them
X_train_tensor, X_test_tensor = (
    torch.tensor(pixels, dtype=torch.float32) / 255.0
    for pixels in (X_train, X_test)
)

In [None]:
# Instantiate an MLP with freshly initialised (untrained) weights and use it
# as a feature extractor on the test images.
model = MLP()
model.eval()  # inference only; matches the other feature-extraction cells
with torch.no_grad():  # no gradients are needed for feature extraction
    test_features = model(X_test_tensor, return_features=True)

# The features were computed from the test split, so report them as such
print(f"Extracted feature shape for test set: {test_features.shape}")

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt


def plot_features(features, labels):
    """
    Draws a scatter plot of the first two columns of `features`, colored by label.

    No dimensionality reduction is performed here: the function plots
    `features[:, 0]` against `features[:, 1]` as given. The title and axis
    labels refer to UMAP, so callers are expected to pass a 2D UMAP embedding.

    Parameters:
        features (np.array): Array with at least two columns; columns 0 and 1 are plotted.
        labels (np.array): Labels for the data points, used as the hue of the plot.
    """
    # Appearance settings shared by every point of the scatter plot
    scatter_style = dict(
        hue=labels,
        palette="tab10",
        s=30,
        alpha=0.7,
        edgecolor='k',
        legend="full",
    )

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=features[:, 0], y=features[:, 1], **scatter_style)

    # Titles and axis names
    plt.title("2D Visualization of Features using UMAP")
    plt.xlabel("UMAP Component 1")
    plt.ylabel("UMAP Component 2")
    plt.legend(title="Label", loc="best")
    plt.show()

In [None]:
import umap

# Project the extracted features onto two UMAP components (seeded via random_state)
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, n_jobs=1, random_state=42)
# test_features is a torch tensor; hand UMAP an explicit NumPy array,
# consistent with the other UMAP calls in this notebook
umap_embedding = reducer.fit_transform(test_features.detach().cpu().numpy())

In [None]:
# Scatter plot of the 2D embedding, colored by the test labels
plot_features(features=umap_embedding, labels=y_test)

In [None]:
# Load the trained MLP weights into the existing model
weights = os.path.join("..", "classification", "data", "weights", "MLP", "MNIST.pth")
print(f"Loading model weights from {weights}")
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
# map_location="cpu" lets a checkpoint that was saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(weights, map_location="cpu"))
model.eval()
with torch.no_grad():  # inference only
    test_features = model(X_test_tensor, return_features=True)

# Re-fit UMAP on the new features (passed as a NumPy array) and plot them
umap_embedding = reducer.fit_transform(test_features.detach().cpu().numpy())
plot_features(umap_embedding, y_test)

### LeNet as feature extractor

In [63]:
class LeNet5(nn.Module):
    """LeNet-5 style CNN whose 84-dimensional penultimate layer can be read out as features.

    Parameters:
        num_classes (int): Size of the final linear layer. Defaults to 2
            (circle / no-circle), which is the head this class was written with.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1)
        # 16 * 5 * 5 is the flattened size after the second pooling step for 28x28 inputs
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)  # Output layer, one unit per class

    def forward(self, x, return_features=False):
        """Run the network on a batch of single-channel images.

        Parameters:
            x (torch.Tensor): Batch of shape (N, 1, H, W); the flatten step
                expects 16x5x5 feature maps, which 28x28 inputs produce.
            return_features (bool): When True, return the ReLU output of `fc2`
                instead of the class scores.

        Returns:
            torch.Tensor: (N, 84) features if `return_features` is True,
            otherwise the (N, num_classes) output of `fc3`.
        """
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        x = x.view(-1, 16 * 5 * 5)  # flatten the feature maps
        x = torch.relu(self.fc1(x))
        features = torch.relu(self.fc2(x))
        if return_features:
            return features  # Return features before the final layer
        return self.fc3(features)

In [None]:
# Build LeNet5 and load its trained weights
# NOTE(review): LeNet5() is built with a 2-unit output layer -- confirm the checkpoint has the same head
model = LeNet5()
weights = os.path.join("..", "data", "weights", "Lenet5", "MNIST.pth")
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
# map_location="cpu" lets a checkpoint that was saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(weights, map_location="cpu"))
model.eval()
with torch.no_grad():  # inference only
    # X_test_tensor holds flattened 784-pixel rows, but the convolutional
    # layers need image-shaped input of shape (N, 1, 28, 28)
    test_features = model(X_test_tensor.reshape(-1, 1, 28, 28), return_features=True)

# The features were computed from the test split, so report them as such
print(f"Extracted feature shape for test set: {test_features.shape}")

In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import umap
import matplotlib.pyplot as plt
from torch.utils.data import Subset, DataLoader

# Step 1: Load MNIST Dataset
transform = transforms.ToTensor()
train_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Keep a random half of each split, sampled without replacement
train_indices = np.random.choice(len(train_data), int(0.5 * len(train_data)), replace=False)
test_indices = np.random.choice(len(test_data), int(0.5 * len(test_data)), replace=False)
train_dataset = Subset(train_data, train_indices)
test_dataset = Subset(test_data, test_indices)

# Shuffled mini-batches for training; large, unshuffled batches for evaluation
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=5000, shuffle=False)

# Step 2: Define a Simple MLP
class MLP(nn.Module):
    """Three-layer perceptron (784 -> 256 -> 64 -> 10) that also exposes its 64-d hidden features."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 256)  # First hidden layer
        self.fc2 = nn.Linear(256, 64)     # Second hidden layer
        self.fc3 = nn.Linear(64, 10)      # Output layer (for classification)

    def forward(self, x):
        """Return `(hidden_features, output)` for a batch of 28x28 images.

        `hidden_features` is the ReLU output of the second hidden layer and
        `output` is the result of applying the output layer to it.
        """
        flat = x.view(-1, 28*28)  # Flatten the image
        hidden_features = torch.relu(self.fc2(torch.relu(self.fc1(flat))))
        return hidden_features, self.fc3(hidden_features)

# Step 3: Initialize the MLP
model = MLP()

# Step 4: Extract Features Before Training (Untrained Network)
model.eval()  # Set the model to evaluation mode
feature_batches_before, label_batches_before = [], []
with torch.no_grad():
    for images, labels in test_loader:
        batch_features, _ = model(images)  # Features from the second hidden layer before training
        feature_batches_before.append(batch_features)
        label_batches_before.append(labels)

# Concatenate every batch so the features and labels cover the whole test
# loader, not just its last batch (the two only coincide when the loader
# yields exactly one batch)
hidden_features_before = torch.cat(feature_batches_before, dim=0)

# Convert hidden features and labels to numpy for UMAP
hidden_features_before_np = hidden_features_before.numpy()
labels_np = torch.cat(label_batches_before, dim=0).numpy()

# Step 5: Apply UMAP to the untrained features
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, n_jobs=1, random_state=42)
umap_embedding_before = reducer.fit_transform(hidden_features_before_np)

# Step 6: Train the MLP
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 5
model.train()  # Training mode; nothing inside the loop switches it back
for epoch in range(epochs):
    for images, labels in train_loader:
        optimizer.zero_grad()  # Clear gradients left over from the previous step
        outputs = model(images)[1]  # Only the class scores are needed for the loss
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

test_images, test_labels, feature_batches_after = [], [], []
# Step 7: Extract Features After Training (Trained Network)
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    for images, labels in test_loader:
        batch_features, _ = model(images)  # Features from the second hidden layer after training
        # Collect features together with their images and labels so the
        # three stay aligned row by row, however many batches there are
        feature_batches_after.append(batch_features)
        test_images.append(images)
        test_labels.append(labels)

# Flatten the list of batches into a single list of images and labels
test_images = torch.cat(test_images, dim=0).numpy()  # Convert to numpy array for plotting
test_labels = torch.cat(test_labels, dim=0).numpy()  # Convert to numpy array for labels

# Concatenate the per-batch features, then convert them to numpy
hidden_features_after = torch.cat(feature_batches_after, dim=0)
hidden_features_after_np = hidden_features_after.numpy()

# Step 8: Apply UMAP to the trained features
umap_embedding_after = reducer.fit_transform(hidden_features_after_np)


In [None]:
import matplotlib.patches as mpatches
import numpy as np
import matplotlib.pyplot as plt

# Define the unique labels (digits) and the colormap
unique_labels = np.unique(labels_np)
cmap = plt.get_cmap('tab10')  # Discrete colormap for 10 categories

# Pin the color normalization to the colormap's slots so that label i always
# maps to slot i, regardless of which labels happen to be present in the data.
# NOTE(review): assumes the labels are the integer digits 0-9 -- confirm
scatter_colors = dict(cmap=cmap, vmin=0, vmax=cmap.N - 1)

# Create a figure
plt.figure(figsize=(14, 6))

# Before Training
plt.subplot(1, 2, 1)
scatter_before = plt.scatter(umap_embedding_before[:, 0], umap_embedding_before[:, 1], c=labels_np, s=5, **scatter_colors)
plt.title('UMAP Projection of MLP Features (Before Training)', fontsize=12)

# After Training
plt.subplot(1, 2, 2)
scatter_after = plt.scatter(umap_embedding_after[:, 0], umap_embedding_after[:, 1], c=labels_np, s=5, **scatter_colors)
plt.title('UMAP Projection of MLP Features (After Training)', fontsize=12)

# Create a legend manually; an integer argument indexes the colormap slot
# directly, so each patch gets exactly the color the scatter uses for that digit
handles = [mpatches.Patch(color=cmap(int(i)), label=f'Digit {i}') for i in unique_labels]

# Position the legend outside the plot to the right
plt.legend(handles=handles, title="Digits", loc='center left', bbox_to_anchor=(1, 0.5), fontsize=10)

# Adjust layout and show the plot
plt.tight_layout(rect=[0, 0, 0.85, 1])  # Make space for the legend
plt.show()


In [45]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestNeighbors

def _closest_other_label_indices(cluster_digit, umap_embedding, labels, num_images):
    """Return indices of the `num_images` points not labelled `cluster_digit` nearest to that digit's centroid."""
    labels = np.asarray(labels)
    is_member = labels == cluster_digit
    if not is_member.any():
        # Without members the centroid would be NaN and the selection meaningless
        raise ValueError(f"No data points with label {cluster_digit}; cannot locate its cluster center")

    # Center of the cluster: mean embedding of all points carrying the given label
    cluster_center = np.mean(umap_embedding[is_member], axis=0)

    # Candidates: every point with a different label, ranked by distance to the center
    diff_label_indices = np.flatnonzero(~is_member)
    distances_to_cluster = np.linalg.norm(umap_embedding[diff_label_indices] - cluster_center, axis=1)
    return diff_label_indices[np.argsort(distances_to_cluster)[:num_images]]


def plot_misclustered_images(cluster_digit, umap_embedding, labels, test_images, num_images=10):
    """
    Plots the `num_images` nearest images with labels different from the given cluster digit.

    "Nearest" is measured in the embedding, as the Euclidean distance to the
    mean position of all points labelled `cluster_digit`.

    Parameters:
    - cluster_digit (int): The digit corresponding to the cluster you want to inspect.
    - umap_embedding (np.ndarray): The 2D UMAP projection of the data.
    - labels (np.ndarray): The true labels for the data points.
    - test_images (np.ndarray): The original test images corresponding to the data points;
      each one is reshaped to 28x28 for display.
    - num_images (int): Number of misclustered images to plot. Default is 10.

    Raises:
    - ValueError: If `num_images` is smaller than 1, or if no point carries the label `cluster_digit`.
    """
    if num_images < 1:
        raise ValueError(f"num_images must be at least 1, got {num_images}")

    # Steps 1-4: pick the differently-labelled points closest to the cluster center
    closest_indices = _closest_other_label_indices(cluster_digit, umap_embedding, labels, num_images)

    # Step 5: Plot the corresponding images side by side
    plt.figure(figsize=(2*num_images, num_images))
    for idx, mislabeled_index in enumerate(closest_indices):
        plt.subplot(1, num_images, idx + 1)
        plt.imshow(test_images[mislabeled_index].reshape(28, 28), cmap='gray')
        plt.title(f"True Label: {labels[mislabeled_index]}")
        plt.axis('off')

    plt.tight_layout()
    plt.show()


In [None]:
# Show the four differently-labelled images closest to the center of the "1" cluster
plot_misclustered_images(
    cluster_digit=1,
    umap_embedding=umap_embedding_after,
    labels=test_labels,
    test_images=test_images,
    num_images=4,
)

In [None]:
# Show the four differently-labelled images closest to the center of the "0" cluster
plot_misclustered_images(
    cluster_digit=0,
    umap_embedding=umap_embedding_after,
    labels=test_labels,
    test_images=test_images,
    num_images=4,
)