In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

from models.pipeline import Pipeline
from utils.view import draw_grid

from models.convolutional_vae_v2 import ConvolutionalVAEV2 
from models.convolutional_vae_v2 import preprocess_grid as preprocess_vaev2, postprocess_grid as postprocess_vaev2
from models.convolutional_vae_v2_vampprior import ConvolutionalVAEV2_VampPrior
from models.convolutional_vae_v2_vampprior import preprocess_grid as preprocess_grid_vp, postprocess_grid as postprocess_grid_vp
from models.convolutional_vae_v3 import ConvolutionalVAEV3
from models.convolutional_vae_v3 import preprocess_grid as preprocess_vaev3, postprocess_grid as postprocess_vaev3
from models.convolutional_vqvae import ConvolutionalVQVAE
from models.convolutional_vqvae import preprocess_grid as preprocess_vq, postprocess_grid as postprocess_vq
from models.fully_connected_vae import FullyConnectedVAE
from models.ppca_pipeline import preprocess_grid as preprocess_ppca_into_vae, postprocess_grid as postprocess_ppca_into_vae
from models.ppca_pipeline import get_compression_functions as get_ppca_into_vae_compresison_functions
from models.vq_pipeline import preprocess_grid as preprocess_vq_into_vae, postprocess_grid as postprocess_vq_into_vae
from models.vq_pipeline import get_compression_functions as get_vq_into_vae_compression_functions


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
model = ConvolutionalVAEV2(
    in_channels=10, 
    starting_filters=64, 
    latent_dim=512,
    feature_dim=[8, 8]
).to(device)

checkpoint = torch.load('../../checkpoints/conv_vaev2_512_b01.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

pipeline = Pipeline(
    model=model,
    preprocess_fn=preprocess_vaev2,
    postprocess_fn=postprocess_vaev2,
)
model_type = "vae"

In [None]:
model = ConvolutionalVAEV3(
    in_channels=10, 
    starting_filters=64, 
    latent_dim=256,
    feature_dim=[4, 4]
).to(device)

checkpoint = torch.load('../../checkpoints/conv_vaev3_256_64_b1.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

pipeline = Pipeline(
    model=model,
    preprocess_fn=preprocess_vaev3,
    postprocess_fn=postprocess_vaev3,
)
model_type = "vae"

In [None]:
model = ConvolutionalVQVAE(
    in_channels=10, 
    starting_filters=64, 
    num_embeddings=256,
    embedding_dim=128,
    commitment_cost=0.25
).to(device)

checkpoint = torch.load('../../checkpoints/conv_vqvae_6x6x128_b1.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

pipeline = Pipeline(
    model=model,
    preprocess_fn=preprocess_vq,
    postprocess_fn=postprocess_vq,
)
model_type = "vq"

In [None]:
input_dim = 6*6*64
model = FullyConnectedVAE(
    input_dim=input_dim,
    hidden_dim=1024,
    latent_dim=64
).to(device)

checkpoint = torch.load('../../checkpoints/vq_vae_64_b1.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

compress_fn, decompress_fn = get_vq_into_vae_compression_functions('../../checkpoints/conv_vqvae_6x6x64_b2.pt')

pipeline = Pipeline(
    model=model,
    preprocess_fn=preprocess_vq_into_vae,
    postprocess_fn=postprocess_vq_into_vae,
    compress_fn=compress_fn,
    decompress_fn=decompress_fn,
)
model_type = "vae"

In [None]:
n_components = 256

# input_dim = n_components
input_dim = 128
model = FullyConnectedVAE(
    input_dim=input_dim,
    hidden_dim=1024,
    latent_dim=64
).to(device)

checkpoint = torch.load('../../checkpoints/ppca128_vae_64_b5.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

compress_fn, decompress_fn = get_ppca_into_vae_compresison_functions(f'../../checkpoints/ppca_{n_components}.pkl')

pipeline = Pipeline(
    model=model,
    preprocess_fn=preprocess_ppca_into_vae,
    postprocess_fn=postprocess_ppca_into_vae,
    compress_fn=compress_fn,
    decompress_fn=decompress_fn,
)
model_type = "vae"

In [3]:
from utils.load_data import get_grids

data, _  = get_grids(filepath="../../data/evaluation", split=False)

In [16]:
def grid_to_latent(pipeline: Pipeline, grid, model_type="vq"):
    grid = pipeline.preprocess_and_compress(grid)
    z = grid
    if model_type == 'vq':
        z = pipeline.encode(grid.unsqueeze(0).to(device))
    else:
        z, _ = pipeline.encode(grid.unsqueeze(0).to(device))
    
    z_size = z.size()
    z_flat = z.view(z.size(0), -1)

    return z_flat, z_size

def extract_transformations(pipeline: Pipeline, train_pairs, model_type="vq"):
    z_diffs = []

    for input, output in train_pairs:
        z_input, _ = grid_to_latent(pipeline, input, model_type)
        z_output, _ = grid_to_latent(pipeline, output, model_type)
        z_diffs.append((z_output - z_input).squeeze(0).detach().cpu().numpy())
    
    return z_diffs


In [None]:
import hdbscan
from sklearn.preprocessing import StandardScaler

train_transformations = {}
test_transformations = {}

for puzzle_id, task in data.items():
    train_trans = extract_transformations(pipeline, task['train'], model_type)
    test_trans = extract_transformations(pipeline, task['test'], model_type)
    
    train_df = pd.DataFrame(train_trans)
    test_df = pd.DataFrame(test_trans)
    
    if len(train_df) > 0:
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(train_df)
        
        clusterer = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=1, prediction_data=True)
        train_labels = clusterer.fit_predict(train_scaled)
        
        train_df['hdbscan_cluster'] = train_labels
        train_df['hdbscan_probabilities'] = clusterer.probabilities_
        
        if len(test_df) > 0:
            test_scaled = scaler.transform(test_df)
            test_labels, test_probs = hdbscan.approximate_predict(clusterer, test_scaled)
            test_df['hdbscan_cluster'] = test_labels
            test_df['hdbscan_probabilities'] = test_probs
    
    train_transformations[puzzle_id] = train_df
    test_transformations[puzzle_id] = test_df

def plot_clusters(df, title, ax=None):
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6))
    
    if 'hdbscan_cluster' in df.columns:
        scatter = ax.scatter(df.index, df['hdbscan_probabilities'], c=df['hdbscan_cluster'], cmap='viridis', alpha=0.7)
        legend1 = ax.legend(*scatter.legend_elements(), title="Clusters")
        ax.add_artist(legend1)
    
    ax.set_title(title)
    ax.set_xlabel("Sample Index")
    ax.set_ylabel("HDBSCAN Probability")
    ax.xaxis.set_major_locator(mtick.MaxNLocator(integer=True))
    
    return ax


  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defined clusters, new data'
  warn('Clusterer does not have any defi

In [None]:
import math

n_puzzles = len(train_transformations)


n_cols = min(3, n_puzzles)
n_rows = math.ceil(n_puzzles / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))


if n_puzzles == 1:
    axes = [axes]
elif n_rows == 1:
    axes = [axes]
else:
    axes = axes.flatten()

# # Plot each puzzle
# for i, (puzzle_id, train_df) in enumerate(train_transformations.items()):
#     ax = axes[i] if n_puzzles > 1 else axes[0]
    
#     # Plot training data
#     plot_clusters(train_df, f"Puzzle {puzzle_id} - Train", ax)
    
#     # Add test data if available
#     test_df = test_transformations[puzzle_id]
#     if len(test_df) > 0 and 'hdbscan_cluster' in test_df.columns:
#         # Offset test data slightly for visibility
#         test_x = test_df.index + len(train_df) + 0.5
#         ax.scatter(test_x, test_df['hdbscan_probabilities'], 
#                   c=test_df['hdbscan_cluster'], cmap='viridis', 
#                   alpha=0.7, marker='s', s=50, label='Test')
#         ax.axvline(x=len(train_df) - 0.5, color='red', linestyle='--', alpha=0.5, label='Train/Test Split')
#         ax.legend()

# # Hide unused subplots
# for i in range(n_puzzles, len(axes)):
#     axes[i].set_visible(False)

# plt.tight_layout()
# plt.show()

# Alternative: Plot train and test separately
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Combine all training data
all_train_data = []
all_train_labels = []
all_train_probs = []
puzzle_boundaries = [0]

for puzzle_id, train_df in train_transformations.items():
    if 'hdbscan_cluster' in train_df.columns:
        all_train_data.extend(range(len(train_df)))
        all_train_labels.extend(train_df['hdbscan_cluster'])
        all_train_probs.extend(train_df['hdbscan_probabilities'])
        puzzle_boundaries.append(puzzle_boundaries[-1] + len(train_df))

# Plot all training data
if all_train_data:
    scatter1 = ax1.scatter(all_train_data, all_train_probs, c=all_train_labels, 
                          cmap='viridis', alpha=0.7)
    ax1.set_title("All Training Data Clusters")
    ax1.set_xlabel("Sample Index")
    ax1.set_ylabel("HDBSCAN Probability")
    
    # Add puzzle boundaries
    for boundary in puzzle_boundaries[1:-1]:
        ax1.axvline(x=boundary - 0.5, color='red', linestyle='--', alpha=0.3)
    
    legend1 = ax1.legend(*scatter1.legend_elements(), title="Clusters")
    ax1.add_artist(legend1)

# Combine all test data
all_test_data = []
all_test_labels = []
all_test_probs = []
test_boundaries = [0]

for puzzle_id, test_df in test_transformations.items():
    if len(test_df) > 0 and 'hdbscan_cluster' in test_df.columns:
        all_test_data.extend(range(len(test_df)))
        all_test_labels.extend(test_df['hdbscan_cluster'])
        all_test_probs.extend(test_df['hdbscan_probabilities'])
        test_boundaries.append(test_boundaries[-1] + len(test_df))

# Plot all test data
if all_test_data:
    scatter2 = ax2.scatter(all_test_data, all_test_probs, c=all_test_labels, 
                          cmap='viridis', alpha=0.7, marker='s')
    ax2.set_title("All Test Data Clusters")
    ax2.set_xlabel("Sample Index")
    ax2.set_ylabel("HDBSCAN Probability")
    
    # Add puzzle boundaries
    for boundary in test_boundaries[1:-1]:
        ax2.axvline(x=boundary - 0.5, color='red', linestyle='--', alpha=0.3)
    
    legend2 = ax2.legend(*scatter2.legend_elements(), title="Clusters")
    ax2.add_artist(legend2)

plt.tight_layout()
plt.show()

# Summary statistics
print("Clustering Summary:")
print("-" * 50)
for puzzle_id, train_df in train_transformations.items():
    if 'hdbscan_cluster' in train_df.columns:
        n_clusters = len(set(train_df['hdbscan_cluster'])) - (1 if -1 in train_df['hdbscan_cluster'].values else 0)
        n_noise = sum(train_df['hdbscan_cluster'] == -1)
        avg_prob = train_df['hdbscan_probabilities'].mean()
        
        test_df = test_transformations[puzzle_id]
        test_info = ""
        if len(test_df) > 0 and 'hdbscan_cluster' in test_df.columns:
            test_clusters = len(set(test_df['hdbscan_cluster'])) - (1 if -1 in test_df['hdbscan_cluster'].values else 0)
            test_noise = sum(test_df['hdbscan_cluster'] == -1)
            test_info = f" | Test: {test_clusters} clusters, {test_noise} noise"
        
        print(f"Puzzle {puzzle_id}: {n_clusters} clusters, {n_noise} noise points, avg prob: {avg_prob:.3f}{test_info}")