In [None]:
%cd ../..
%load_ext autoreload

%autoreload 2

In [2]:
import os
import random
import textwrap as tw
from ast import literal_eval
from collections import Counter
from io import BytesIO

import cv2
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from scipy.spatial.distance import cdist
from scipy.spatial.distance import pdist, squareform
from sqlalchemy.sql import text
from tqdm.notebook import tqdm

from emv.db.dao import DataAccessObject
from emv.features.pose import load_poses
from emv.features.pose_utils import draw_pose, CONNECTIONS, KEYPOINTS_NAMES, ANGLES_ASSOCIATIONS

# Clustering
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN, KMeans
from hdbscan import HDBSCAN

# DR
from umap import UMAP
from umap.umap_ import nearest_neighbors
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from trimap import TRIMAP
import pymde

# Metrics
from emv.embeddings.dr_eval import compute_embeddings, compute_umap_embeddings, plot_embeddings, format_params
from emv.embeddings.dr_eval import \
    compute_coranking_metrics, \
    random_triplet_accuracy, \
    compute_pcc, \
    global_score

pd.options.mode.chained_assignment = None

In [None]:
# Load the 'pose_filtered' features: a reduced version of the 'pose' feature set that
# only keeps the "interesting" poses of each sport.
query = text("SELECT * FROM feature WHERE feature_type = 'pose_filtered'")
pose_df = pd.DataFrame(DataAccessObject().fetch_all(query))

# The embedding is stored as a string literal; parse it back into a Python list.
pose_df['embedding_33'] = pose_df['embedding_33'].apply(literal_eval)
# Expose the sport stored in the per-row `data` payload as its own column.
pose_df["sport"] = [payload["sport"] for payload in pose_df["data"]]

print(f"{len(pose_df)} poses retrieved")

In [None]:
# Number of retrieved poses per sport.
pose_df.sport.value_counts()

# Embedding

In [None]:
# Cap every sport at N_sample poses so that over-represented sports do not dominate
# the embeddings. Sports with fewer poses are kept whole (and in their original order).
N_sample = 500
sport_poses = []
for sport in pose_df.sport.unique():
    poses_in_sport = pose_df[pose_df.sport == sport]  # filter once per sport
    if len(poses_in_sport) >= N_sample:
        poses_in_sport = poses_in_sport.sample(N_sample, random_state=42)
    sport_poses.append(poses_in_sport)
# Reset the index so that row labels match the row positions of `features` below.
sport_poses = pd.concat(sport_poses).reset_index(drop=True)
print(f"Testing with {len(sport_poses)} poses.")

features = np.array(sport_poses["embedding_33"].tolist())

# One "Set2" colour per sport, assigned in order of first appearance. The palette and
# the lookup table are built once instead of once per row.
sport_order = list(sport_poses.sport.unique())
sport_to_color = dict(zip(sport_order, sns.color_palette("Set2", n_colors=len(sport_order))))
colors = sport_poses.sport.map(sport_to_color.get)

### 2D Embeddings

In [None]:
# PCA baseline; this cell (re)creates the list that the following cells append to.
pca_result = compute_embeddings(features=features, reducer=PCA, params={"n_components": 2})
features_embeddings = [pca_result]

In [None]:
# UMAP embeddings, one per neighbourhood size.
# Note: re-running this cell appends the same results to features_embeddings again.
n_neighbors = [50, 100, 500]
umap_results = compute_umap_embeddings(features=features, n_neighbors=n_neighbors, min_dist=0.1)
features_embeddings.extend(umap_results)

In [None]:
# t-SNE embeddings (cosine metric), one per perplexity value.
perps = [5, 10, 50, 100]
features_embeddings.extend(
    compute_embeddings(
        features=features,
        reducer=TSNE,
        params={"n_components": 2, "metric": "cosine", "perplexity": perp},
    )
    for perp in perps
)

In [None]:
# TRIMAP embeddings. n_inliers:n_outliers:n_random follows a 2:1:1 ratio
# (as recommended in the paper).
n_inliers_values = [10, 20, 50]
for n_inliers in n_inliers_values:
    n_other = n_inliers // 2
    trimap_params = {
        "n_inliers": n_inliers,
        "n_outliers": n_other,
        "n_random": n_other,
        "distance": "cosine",
    }
    features_embeddings.append(
        compute_embeddings(features=features, reducer=TRIMAP, params=trimap_params)
    )

In [None]:
# Plot every embedding collected in features_embeddings (layout is decided by the
# project helper plot_embeddings).
plot_embeddings(features_embeddings, fig_title = "Human angles embeddings")

### 3D Embeddings

In [None]:
# 3-component UMAP. The seed is fixed (like most other UMAP fits in this notebook) so the
# figure below is reproducible after a kernel restart.
threeD_umap = UMAP(min_dist = 0.5, n_neighbors = 100, n_components=3, random_state=42).fit(features)

In [None]:
SELECTED_SPORTS = ["Weightlifting", "Cycling"]

# Positional boolean mask of the poses that belong to the selected sports. A vectorised
# mask replaces the per-point `i in list` lookups and does not rely on the DataFrame
# index labels being equal to row positions.
is_selected = sport_poses.sport.isin(SELECTED_SPORTS).to_numpy()
sport_ids = np.flatnonzero(is_selected).tolist()

x = threeD_umap.embedding_[:, 0]
y = threeD_umap.embedding_[:, 1]
z = threeD_umap.embedding_[:, 2]
# Flatten z to 0 for every pose outside the selected sports, and draw those smaller.
z_selected = np.where(is_selected, z, 0)
ms = np.where(is_selected, 2, 0.5)

fig = plt.figure(figsize=(18, 6))

ax1 = fig.add_subplot(131, projection = "3d")
ax1.scatter(x, y, z, c=colors, s=0.5)
ax1.set_title("UMAP 3D", fontweight = "bold")
ax1.view_init(10, 45)

ax2 = fig.add_subplot(132, projection='3d')
ax2.scatter(x, y, z_selected, c=colors, s=ms)
ax2.set_title("Selected sport: " + ", ".join(SELECTED_SPORTS), fontweight = "bold")
ax2.view_init(10, 45)

ax3 = fig.add_subplot(133)
ax3.scatter(x, y, c=colors, s=0.5)
ax3.set_title("Top-down view", fontweight = "bold")

plt.show()

### Sphere surface embeddings

In [None]:
# UMAP with the haversine output metric; the two embedding columns are converted to
# Cartesian coordinates on a unit sphere in the next cell.
sphere_mapper = UMAP(output_metric='haversine', min_dist=0.1, n_neighbors=100, random_state=42).fit(features)

In [None]:
# Spherical angles -> Cartesian coordinates on the unit sphere.
polar_angle = sphere_mapper.embedding_[:, 0]
azimuth_angle = sphere_mapper.embedding_[:, 1]

sin_polar = np.sin(polar_angle)
x = sin_polar * np.cos(azimuth_angle)
y = sin_polar * np.sin(azimuth_angle)
z = np.cos(polar_angle)

In [None]:
# 3-D scatter of the sphere embedding, coloured by sport.
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={"projection": "3d"})
ax.scatter(x, y, z, c=colors, s=0.5)
ax.view_init(10, 45)

In [None]:
# Unroll the sphere onto a plane. arctan2(x, y) is an azimuthal (longitude-like) angle
# and -arccos(z) a negated polar (latitude-like) angle, so the variables are named for
# what they actually hold (the original `lat` / `long` names were swapped).
longitude = np.arctan2(x, y)
latitude = -np.arccos(z)

plt.scatter(longitude, latitude, c=colors, s=0.1)
plt.xlabel("Azimuthal angle")
plt.ylabel("Negated polar angle")
plt.title("Sphere embedding unrolled onto a plane")
plt.show()

### Torus surface embeddings

In [None]:
@numba.njit(fastmath=True)
def torus_euclidean_grad(x, y, torus_dimensions=(2*np.pi,2*np.pi)):
    """Euclidean distance with per-axis wraparound (flat torus), plus its gradient.

    For each coordinate i, with a_i = |x_i - y_i| and T_i = torus_dimensions[i], the
    contribution to the squared distance is a_i^2 when 2 * a_i < T_i and
    (T_i - a_i)^2 otherwise:

        D(x, y) = sqrt(sum_i min-like(a_i, T_i - a_i)^2)

    Args:
        x, y: 1-D coordinate arrays; every index of x is also used to index y and
            torus_dimensions.
        torus_dimensions: period of each axis (default 2*pi for both axes).

    Returns:
        (distance, gradient): the distance and the per-axis gradient terms divided by
        (1e-6 + distance).
    """
    distance_sqr = 0.0
    g = np.zeros_like(x)
    for i in range(x.shape[0]):
        a = abs(x[i] - y[i])
        if 2*a < torus_dimensions[i]:
            # Direct path is the shorter one on this axis.
            distance_sqr += a ** 2
            g[i] = (x[i] - y[i])
        else:
            # Going around the wrap is shorter: use T - a and flip the gradient sign.
            distance_sqr += (torus_dimensions[i]-a) ** 2
            g[i] = (x[i] - y[i]) * (a - torus_dimensions[i]) / a
    distance = np.sqrt(distance_sqr)
    # The 1e-6 term keeps the division finite when x == y.
    return distance, g/(1e-6 + distance)

In [None]:
# UMAP whose output space uses the wraparound (torus) distance defined above.
torus_mapper = UMAP(output_metric=torus_euclidean_grad, min_dist=0.1, n_neighbors=100,  random_state=42).fit(features)

In [None]:
R = 1.5 # Size of the doughnut circle
r = 1 # Size of the doughnut cross-section

# First embedding column: angle around the tube; second column: angle around the ring.
tube_angle = torus_mapper.embedding_[:, 0]
ring_angle = torus_mapper.embedding_[:, 1]

ring_distance = R + r * np.cos(tube_angle)
x = ring_distance * np.cos(ring_angle)
y = ring_distance * np.sin(ring_angle)
z = r * np.sin(tube_angle)

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c=colors, s = 0.1)
ax.set_zlim3d(-3, 3)
ax.view_init(35, 70)
ax.set_title("Torus embedding", fontweight = "bold")
plt.show()

### Torus volume embeddings

In [None]:
@numba.njit(fastmath=True)
def torus_volume_euclidean_grad(x, y, torus_dimensions=(2*np.pi, 2*np.pi)):
    """Euclidean distance and gradient in the full volume of a torus.

    Evaluates the wraparound distance between x and nine shifted copies of y (shifts of
    -1, 0 or +1 period on each of the two axes) and keeps the smallest one.

    Args:
        x, y: 1-D coordinate arrays; the shift vector has two entries, so two
            coordinates are expected.
        torus_dimensions: period of each axis (default 2*pi for both axes).

    Returns:
        (distance, gradient): the smallest distance found and the matching gradient
        terms divided by (1e-6 + distance).

    NOTE(review): each shifted candidate is still measured with the same per-axis
    wraparound rule as the torus-surface metric — confirm whether this metric is
    intended to differ from it.
    """
    best_distance_sqr = np.inf
    best_g = np.zeros_like(x)
    
    for dx in [-1, 0, 1]:
        for dy in [-1, 0, 1]:
            # Candidate image of y, translated by whole periods.
            shift = np.array([dx, dy]) * np.array(torus_dimensions)
            distance_sqr = 0.0
            g = np.zeros_like(x)
            
            for i in range(x.shape[0]):
                a = abs(x[i] - (y[i] + shift[i]))
                if a < 0.5 * torus_dimensions[i]:
                    distance_sqr += a ** 2
                    g[i] = (x[i] - (y[i] + shift[i]))
                else:
                    distance_sqr += (torus_dimensions[i] - a) ** 2
                    g[i] = (x[i] - (y[i] + shift[i])) * (a - torus_dimensions[i]) / a

            # Keep the candidate with the smallest squared distance.
            if distance_sqr < best_distance_sqr:
                best_distance_sqr = distance_sqr
                best_g = g

    distance = np.sqrt(best_distance_sqr)
    # The 1e-6 term keeps the division finite when the distance is zero.
    return distance, best_g / (1e-6 + distance)

In [None]:
# UMAP whose output space uses the torus-volume distance defined above.
torus_volume_mapper = UMAP(output_metric=torus_volume_euclidean_grad, min_dist=0.1, n_neighbors=100,  random_state=42).fit(features)

In [None]:
# Shape of the fitted embedding array.
torus_volume_mapper.embedding_.shape

In [None]:
R = 1.5 # Size of the doughnut circle
r = 1 # Size of the doughnut cross-section

# Plot the embedding fitted with the torus-volume metric. This cell previously re-plotted
# `torus_mapper`, so the figure was identical to the torus-surface one.
volume_embedding = torus_volume_mapper.embedding_
x = (R + r * np.cos(volume_embedding[:, 0])) * np.cos(volume_embedding[:, 1])
y = (R + r * np.cos(volume_embedding[:, 0])) * np.sin(volume_embedding[:, 1])
z = r * np.sin(volume_embedding[:, 0])

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c=colors, s = 0.1)
ax.set_zlim3d(-3, 3)
ax.view_init(30, 45)
ax.set_title("Torus volume embedding", fontweight = "bold")
plt.show()

### Cylinder surface embeddings

In [None]:
@numba.njit(fastmath=True)
def cylinder_euclidean_grad(x, y, cylinder_dimension=2*np.pi, linear_dimension=1.0):
    """Euclidean distance and gradient for cylindrical projection.

    Coordinate 0 wraps around with period `cylinder_dimension`; coordinate 1 is an
    ordinary (non-periodic) axis.

    x, y: Points between which the distance and gradient are computed.
    cylinder_dimension: The dimension of the cylindrical wraparound (default 2*pi).
    linear_dimension: The linear dimension (default 1.0). Currently not used by the body.

    Returns (distance, gradient), where the gradient terms are divided by
    (1e-6 + distance).
    """
    distance_sqr = 0.0
    g = np.zeros_like(x)
    
    # Cylindrical dimension (e.g., angular wraparound)
    a = abs(x[0] - y[0])
    if 2 * a < cylinder_dimension:
        distance_sqr += a ** 2
        g[0] = (x[0] - y[0])
    else:
        # Going around the wrap is shorter: use (period - a) and flip the gradient sign.
        distance_sqr += (cylinder_dimension - a) ** 2
        g[0] = (x[0] - y[0]) * (a - cylinder_dimension) / a
    
    # Linear dimension (e.g., height)
    b = abs(x[1] - y[1])
    distance_sqr += b ** 2
    g[1] = (x[1] - y[1])
    
    distance = np.sqrt(distance_sqr)
    # The 1e-6 term keeps the division finite when x == y.
    return distance, g / (1e-6 + distance)

In [None]:
# UMAP whose output space uses the cylinder distance defined above.
cylinder_mapper = UMAP(output_metric=cylinder_euclidean_grad, min_dist=0.1, n_neighbors=100, random_state=42).fit(features)

In [None]:
# Panorama dimensions
R_pano = 4.5
H_pano = 3.55

# Period of the angular axis and radius used to draw the cylinder
cylinder_dimension = 2 * np.pi
radius = R_pano

# Angle: first embedding column wrapped into [0, 2*pi). Height: second column,
# rescaled to [0, H_pano] (the height of the Panorama).
cylinder_embedding = cylinder_mapper.embedding_
theta_coords = cylinder_embedding[:, 0] % cylinder_dimension
raw_heights = cylinder_embedding[:, 1]
h_lowest, h_highest = np.min(raw_heights), np.max(raw_heights)
h_coords = H_pano * (raw_heights - h_lowest) / (h_highest - h_lowest)

# Cylindrical -> Cartesian
x = radius * np.cos(theta_coords)
y = radius * np.sin(theta_coords)
z = h_coords

fig = plt.figure(figsize=(20, 8))

ax1 = fig.add_subplot(121, projection='3d')
ax1.scatter(x, y, z, c=colors, s = 0.1)
ax1.set_title("Cylindrical projection", fontweight = "bold")

ax2 = fig.add_subplot(122)
ax2.scatter(theta_coords, h_coords, c=colors, s = 0.5)
ax2.set_title("Unwrapped cylinder", fontweight = "bold")

plt.tight_layout()
plt.show()

### Testing PyMDE

In [None]:
# 2-D neighbour-preserving MDE with PyMDE's default constraint, coloured by sport.
mde = pymde.preserve_neighbors(features, n_neighbors=100, embedding_dim=2, verbose = True)
embeddings = mde.embed(verbose = True)
pymde.plot(embeddings, color_by=sport_poses.sport, figsize_inches=(6,6))

In [None]:
# Same 2-D problem, with the Standardized constraint passed explicitly.
mde = pymde.preserve_neighbors(features, n_neighbors=100, embedding_dim=2, constraint=pymde.Standardized(), verbose = True)
embeddings = mde.embed(verbose = True)
pymde.plot(embeddings, color_by=sport_poses.sport, figsize_inches=(6,6))

In [None]:
# 3-D neighbour-preserving MDE, coloured by sport.
mde = pymde.preserve_neighbors(features, n_neighbors=100, embedding_dim=3, verbose = True)
embeddings = mde.embed(verbose = True)
pymde.plot(embeddings, color_by=sport_poses.sport, figsize_inches=(6,6))

In [None]:
# Two-stage reduction: 5-D standardized MDE first, then UMAP down to 2-D.
mde = pymde.preserve_neighbors(features, n_neighbors=100, embedding_dim=5, constraint=pymde.Standardized())
embeddings = mde.embed()

# Seed the UMAP stage like most other UMAP fits in this notebook.
# NOTE(review): the MDE stage above is still unseeded, so the result is not fully
# reproducible yet.
reducer = UMAP(n_neighbors=100, min_dist=0.1, n_components=2, random_state=42)
umap_embeddings = reducer.fit_transform(embeddings)

In [None]:
# Scatter of the 2-D UMAP projection of the 5-D MDE embedding, coloured by sport.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], c=colors, s=0.1)
ax.set_title("UMAP embedding of the 5D MDE embedding")
plt.show()

## Adding metadata to the features

In [6]:
# Sport one-hot encoding: each row gets a list of 0/1 integers, one entry per sport
# column produced by pd.get_dummies.
sport_poses['sport_enc'] = pd.get_dummies(sport_poses['sport']).astype(int).values.tolist()

In [None]:
# 2-D UMAP of the pose features concatenated with the one-hot sport vector; the one-hot
# part is multiplied by `scale` to control how strongly the sport pulls poses together.
projections = {}

# Built once: both matrices are independent of `scale`.
pose_matrix = np.array(sport_poses["embedding_33"].tolist())
sport_onehot = np.array(sport_poses["sport_enc"].tolist())

scales = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5]
for scale in scales:
    mapper = UMAP(n_components=2, min_dist=0.1, n_neighbors=100, random_state=42)
    # Vectorised replacement for the row-wise DataFrame.apply + np.concatenate.
    data = np.hstack([pose_matrix, scale * sport_onehot])
    embedding = mapper.fit_transform(data)

    projections[scale] = embedding

In [None]:
fig = plt.figure(figsize=(20, 6))

# Every projection is aligned to the one with the largest metadata scale so that the
# panels are directly comparable.
anchor_embedding = projections[scales[-1]]

for i, scale in enumerate(scales):
    # One column per scale instead of a hard-coded 6, so the grid follows `scales`.
    ax = fig.add_subplot(1, len(scales), i + 1)
    # np.asarray: plot from a NumPy array whatever container pymde.align returns.
    aligned_embedding = np.asarray(pymde.align(source=projections[scale], target=anchor_embedding))
    ax.scatter(aligned_embedding[:, 0], aligned_embedding[:, 1], c=colors, s = 0.1)
    ax.set_title(f"Metadata scale {scale}", fontweight = "bold")
plt.tight_layout()
plt.show()

In [None]:
# Cylinder-metric UMAP of the pose features concatenated with the scaled one-hot sport
# vector.
cylinder_projections = {}

# Built once: both matrices are independent of `scale`.
pose_matrix = np.array(sport_poses["embedding_33"].tolist())
sport_onehot = np.array(sport_poses["sport_enc"].tolist())

scales = [0.01, 0.1, 0.5, 1]
for scale in scales:
    mapper = UMAP(output_metric=cylinder_euclidean_grad, min_dist=0.1, n_neighbors=100, random_state=42)
    # Vectorised replacement for the row-wise DataFrame.apply + np.concatenate.
    data = np.hstack([pose_matrix, scale * sport_onehot])
    embedding = mapper.fit_transform(data)

    cylinder_projections[scale] = embedding

In [None]:
# Panorama dimensions
R_pano = 1
H_pano = 1

# Cylindrical dimension (theta) and height (h)
cylinder_dimension = 2 * np.pi
radius = R_pano  # Radius of the cylinder

# The projection with the largest scale is the alignment target and is plotted as-is.
anchor_embedding = cylinder_projections[scales[-1]]
last_index = len(scales) - 1  # replaces the hard-coded index 3

fig = plt.figure(figsize=(20, 6))
for i, scale in enumerate(scales):
    if i != last_index:
        # np.asarray: work on a NumPy array whatever container pymde.align returns.
        aligned_embedding = np.asarray(pymde.align(source=cylinder_projections[scale], target=anchor_embedding))
    else:
        aligned_embedding = cylinder_projections[scale]

    # Extract the cylindrical (theta) and linear (h) coordinates
    theta_coords = aligned_embedding[:, 0] % cylinder_dimension
    h_coords = aligned_embedding[:, 1]
    #h_coords = H_pano * (h_coords - np.min(h_coords)) / (np.max(h_coords) - np.min(h_coords)) # Remap height to [0, H_pano] size of the Panorama

    # Convert cylindrical coordinates to Cartesian coordinates
    x = radius * np.cos(theta_coords)
    y = radius * np.sin(theta_coords)
    z = h_coords

    # One column per scale, so the grid follows `scales`.
    ax = fig.add_subplot(1, len(scales), i + 1, projection='3d')
    ax.scatter(x, y, z, c=colors, s = 0.1)
    ax.set_title(f"Metadata scale {scale}", fontweight = "bold")

plt.tight_layout()
plt.show()

In [None]:
# 3-D UMAP of the pose features concatenated with the scaled one-hot sport vector.
# Note: this replaces the 2-D `projections` / `scales` defined in an earlier cell.
projections = {}

# Built once: both matrices are independent of `scale`.
pose_matrix = np.array(sport_poses["embedding_33"].tolist())
sport_onehot = np.array(sport_poses["sport_enc"].tolist())

scales = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5]
for scale in scales:
    mapper = UMAP(n_components=3, min_dist=0.1, n_neighbors=100, random_state=42)
    # Vectorised replacement for the row-wise DataFrame.apply + np.concatenate.
    data = np.hstack([pose_matrix, scale * sport_onehot])
    embedding = mapper.fit_transform(data)

    projections[scale] = embedding

In [None]:
fig = plt.figure(figsize=(20, 12))

# Chain-align the projections: each one is aligned to the *aligned* embedding of the
# previous scale, so that every stored embedding lives in the frame of the first one.
# (Aligning to the previous raw projection left consecutive stored embeddings in
# different frames from the third scale onwards.)
embeddings_aligned = {}
previous_aligned = None
for i, scale in enumerate(scales):
    ax = fig.add_subplot(2, 3, i + 1, projection='3d')
    if previous_aligned is None:
        aligned_embedding = projections[scale]
    else:
        aligned_embedding = np.array(pymde.align(source=projections[scale], target=previous_aligned))
    embeddings_aligned[scale] = aligned_embedding
    previous_aligned = aligned_embedding
    ax.scatter(aligned_embedding[:, 0], aligned_embedding[:, 1], aligned_embedding[:, 2], c=colors, s = 0.1)
    ax.set_title(f"Metadata scale {scale}", fontweight = "bold")
plt.tight_layout()
plt.show()

### Create projection

In [None]:
from emv.api.models import Projection, MapProjectionFeatureCreate
from emv.db.queries import create_projection, create_map_projection_feature

total_tiles = len(sport_poses) # either all features or a subset of features
atlas_width = 4096
max_tile_size = 512
max_tiles_per_atlas = (atlas_width // max_tile_size) ** 2
# Ceiling division. `int(total / per_atlas) + 1` counted one atlas too many whenever
# total_tiles was an exact multiple of max_tiles_per_atlas. At least one atlas is
# still reported, as before.
atlas_count = max(1, -(-total_tiles // max_tiles_per_atlas))

print(f"Total tiles: {total_tiles}, max tiles per atlas: {max_tiles_per_atlas} => atlas count: {atlas_count}")

In [None]:
# Persist one projection per metadata scale, plus one map_projection_feature row per pose.
# NOTE(review): this cell writes to the database on every run — confirm before re-running.
for scale, embedding in embeddings_aligned.items():
    # The smallest scale is deliberately not stored.
    if scale == 0.01:
        continue
    
    # Create the projection, replace the names with the desired ones, library_id = 2 is for the IOC
    projection = Projection(
        projection_name=f"IOC Poses + Sport (scale {scale})",
        version="0.0.1",
        library_id=2,
        model_name="openpifpaf_fast",
        model_params={},
        data={},
        dimension=3,
        atlas_folder_path="",
        atlas_width=atlas_width,
        tile_size=max_tile_size,
        atlas_count=atlas_count,
        total_tiles=total_tiles,
        tiles_per_atlas=max_tiles_per_atlas,
    )

    projection_id = create_projection(projection)['projection_id']

    # Create an entry in the map_projection_feature table for each feature, links features, media and coordinates
    # `i` is the DataFrame index label and is used as a row position in `embedding`;
    # this relies on sport_poses having a 0..n-1 index.
    for i, row in sport_poses.iterrows():
        create_map_projection_feature(MapProjectionFeatureCreate(
            projection_id=projection_id,
            media_id=row.media_id,
            atlas_order=-1,
            index_in_atlas=-1,
            coordinates=[embedding[i, 0], embedding[i, 1], embedding[i, 2]],
            feature_id=row.feature_id
        ))
        
    print(f"Projection {projection_id} created with {total_tiles} tiles.")

In [None]:
# Peek at the first two sampled poses.
sport_poses.head(2)

## Emblaze Visualisation

In [None]:
import emblaze

In [None]:
# Subsampling step for the emblaze viewer (1 keeps every pose).
every_nth = 1

# High-dimensional emblaze embedding of the pose features, with the per-pose colours.
emb = emblaze.Embedding({emblaze.Field.POSITION: features[::every_nth], emblaze.Field.COLOR: colors[::every_nth]}, metric="cosine")
emb.compute_neighbors()

In [None]:
# One UMAP projection of `emb` per neighbourhood size, grouped into an EmbeddingSet.
umap_variants = []
for n in [50, 100, 500]:
    umap_variants.append(emb.project(method="umap", n_neighbors=n, min_dist=0.1))
variants = emblaze.EmbeddingSet(umap_variants)

variants.compute_neighbors(metric="cosine")

In [None]:
def draw_annotation_pose(pose, threshold = 0.1, linewidth = 5, color = "black", alpha = 1, size = (128, 128)):
    """Render a pose skeleton as a small RGBA thumbnail with a transparent background.

    Args:
        pose: mapping-like object; pose["data"]["keypoints"] is a sequence of
            (x, y, confidence) entries that is indexed by KEYPOINTS_NAMES positions.
        threshold: keypoints (and limbs touching a keypoint) whose confidence is not
            above this value are not drawn.
        linewidth: line width of the limbs.
        color: colour of the keypoints and limbs.
        alpha: opacity of the keypoints and limbs.
        size: (width, height) of the returned thumbnail in pixels.

    Returns:
        A PIL RGBA image in which every pure-white pixel has been made fully transparent.
    """
    keypoints = pose["data"]["keypoints"]

    fig, ax = plt.subplots(figsize=(4, 4))
    buffer = BytesIO()
    try:
        visible = [kp for kp in keypoints if kp[2] > threshold]
        ax.scatter([kp[0] for kp in visible],
                   [kp[1] for kp in visible],
                   s=10, color=color, alpha=alpha)
        for c in CONNECTIONS:
            k1 = keypoints[KEYPOINTS_NAMES.index(c[0])]
            k2 = keypoints[KEYPOINTS_NAMES.index(c[1])]
            if k1[2] > threshold and k2[2] > threshold:
                ax.plot([k1[0], k2[0]],
                        [k1[1], k2[1]],
                        linewidth=linewidth, color=color, alpha=alpha)

        # Image coordinates grow downwards, so flip the y axis.
        ax.invert_yaxis()
        ax.axis("off")
        ax.patch.set_alpha(0)

        fig.savefig(buffer)
    finally:
        # Always release the figure, even if drawing fails; thumbnails are rendered in
        # long loops and leaked figures accumulate in memory.
        plt.close(fig)
    buffer.seek(0)

    img = Image.open(buffer).resize(size, Image.Resampling.BICUBIC).convert("RGBA")

    # Make pure-white pixels transparent (vectorised instead of a per-pixel Python loop).
    pixels = np.array(img)
    is_white = (pixels[..., :3] == 255).all(axis=-1)
    pixels[is_white, 3] = 0

    return Image.fromarray(pixels)

In [None]:
# Render one skeleton thumbnail per (subsampled) pose and wrap them for emblaze.
poses_for_viewer = sport_poses[::every_nth]
thumbnails = [
    draw_annotation_pose(pose)
    for _, pose in tqdm(poses_for_viewer.iterrows(), total = len(poses_for_viewer))
]

thumbnails = emblaze.ImageThumbnails([np.array(thumbnail) for thumbnail in thumbnails])

In [None]:
# Interactive emblaze viewer over the UMAP variants; the widget is displayed as the
# cell output.
w = emblaze.Viewer(embeddings = variants, thumbnails = thumbnails)
w

### Plot with thumbnails

In [None]:
from io import BytesIO
from PIL import Image
import base64

from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from emv.features.pose_utils import CONNECTIONS, KEYPOINTS_NAMES

In [None]:
def draw_annotation_pose(pose, threshold = 0.1, linewidth = 5, color = "black", alpha = 1):
    """Draw the skeleton of `pose` and return it as a 128x128 RGBA thumbnail.

    Keypoints are read from pose["data"]["keypoints"] as (x, y, confidence) entries.
    Only keypoints with confidence above `threshold` are drawn, and a limb is drawn only
    when both of its end points pass the threshold. Pure-white pixels of the rendered
    image are made fully transparent.

    Note: this definition duplicates the one in the "Emblaze Visualisation" section.
    """
    fig = plt.figure(figsize=(4,4))

    keypoints = pose["data"]["keypoints"]
    confident = [kp for kp in keypoints if kp[2] > threshold]
    plt.scatter([kp[0] for kp in confident],
                [kp[1] for kp in confident],
                s=10, color=color, alpha = alpha)

    for link in CONNECTIONS:
        start = keypoints[KEYPOINTS_NAMES.index(link[0])]
        end = keypoints[KEYPOINTS_NAMES.index(link[1])]
        if not (start[2] > threshold and end[2] > threshold):
            continue
        plt.plot([start[0], end[0]],
                 [start[1], end[1]],
                 linewidth=linewidth, color=color, alpha = alpha)

    ax = plt.gca()
    ax.invert_yaxis()  # image coordinates grow downwards
    plt.axis("off")
    ax.patch.set_alpha(0)

    buffer = BytesIO()
    fig.savefig(buffer)
    buffer.seek(0)
    plt.close(fig)

    img = Image.open(buffer).resize((128,128), Image.Resampling.BICUBIC)
    img = img.convert("RGBA")

    # Replace pure white with fully transparent white; keep every other pixel.
    white = (255, 255, 255)
    img.putdata([
        (255, 255, 255, 0) if pixel[:3] == white else pixel
        for pixel in img.getdata()
    ])

    return img

In [None]:
# One "hsv" colour per sport for the thumbnail plots. The palette gets its own name so
# that it does not overwrite the per-pose `colors` Series used by the plotting cells
# above (which would break re-running them).
sports = sport_poses.sport.unique()
sport_palette = sns.color_palette("hsv", len(sports))
colors_map = dict(zip(sports, sport_palette))

In [None]:
# Use the last embedding appended to features_embeddings for the plots below.
#embeddings = compute_umap_embeddings(features = features, n_neighbors = [500], min_dist = 0.5)[0]["embeddings"]
embeddings = features_embeddings[-1]["embeddings"]

In [None]:
# Embedding scatter coloured by sport, with one legend entry per sport.
point_colors = [colors_map[s] for s in sport_poses.sport]
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label=s, markerfacecolor=colors_map[s], markersize=10)
    for s in sports
]

plt.figure(figsize=(8,8))
plt.scatter(embeddings[:,0], embeddings[:,1], c = point_colors, s = 0.1)
plt.legend(handles=legend_handles, loc = [1.01, 0])
plt.show()

In [None]:
#sport_poses["embedding"] = embeddings.tolist()

# Small multiples: one panel per sport, all sharing the same fixed axis limits.
n_rows = 4
n_cols = int(len(sports) / n_rows) + 1
d = 3

fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * d, n_rows * d))
axs = axs.flatten()
for ax, sport in zip(axs, sports):
    in_sport = sport_poses.sport == sport
    ax.scatter(embeddings[in_sport,0], embeddings[in_sport,1], c = colors_map[sport], s = 0.1)
    ax.set_title(sport)
    ax.set_xlim(-0.1,1.1)
    ax.set_ylim(-0.1,1.1)
    ax.set_xticks([])
    ax.set_yticks([])
# Hide the panels that are left over once every sport has been drawn.
for unused_ax in axs[len(sports):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Keep every EVERY_N-th pose so that roughly N thumbnails are drawn on the plot.
N = 1000
EVERY_N = max(1, len(sport_poses) // N)

sample_poses = sport_poses.iloc[::EVERY_N]

thumbnails = [
    #color = colors_map[pose["sport"]]
    draw_annotation_pose(pose, color = "black")
    for _, pose in tqdm(sample_poses.iterrows())
]

In [None]:
from emv.embeddings.dr_eval import plot_embeddings_with_images

# Subsample the embedding with the same step as the thumbnails so that rows line up.
#embeddings = features_embeddings[3]["embeddings"][::EVERY_N]
sample_embeddings = embeddings[::EVERY_N]
plot_embeddings_with_images(sample_embeddings, thumbnails, zoom = 0.2, figsize = 20)

### Nearest Neighbors

In [None]:
k = 5          # number of most frequent sports shown in the bar chart
radius = 0.05  # neighbourhood radius in embedding units

# For every sampled pose, collect the positional indices of all sampled poses that lie
# within `radius` of it in the 2-D embedding.
knn = NearestNeighbors(radius = radius)
knn.fit(sample_embeddings)
dists, ids = knn.radius_neighbors(sample_embeddings)

# Work on an explicit copy: sample_poses was created as a slice of sport_poses.
sample_poses = sample_poses.copy()
sample_poses["coords"] = list(sample_embeddings)
sample_poses["nearest_ids"] = list(ids)
# (sport, count) pairs of the neighbourhood, most frequent first.
sample_poses["nearest_sports"] = sample_poses.nearest_ids.map(lambda x: Counter(sample_poses.iloc[x].sport.tolist()).most_common())


for _ in range(10):
    fig, axs = plt.subplots(1, 3, figsize=(18, 6))
    random_pose = sample_poses.sample(1)
    pose_x, pose_y = random_pose["coords"].values[0][0], random_pose["coords"].values[0][1]

    draw_pose(random_pose.squeeze(axis = 0), ax = axs[0], show_frame = True)

    axs[1].scatter(sample_embeddings[:,0], sample_embeddings[:,1], s = 1)
    axs[1].scatter(pose_x, pose_y, s = 30, color = "red", marker = "x")
    axs[1].add_patch(plt.Circle((pose_x, pose_y), radius, color='red', fill = False))

    axs[1].set_title(f"Embedded poses - Matched pose from {random_pose.sport.values[0]}")
    axs[1].set_xticks([])
    axs[1].set_yticks([])

    # Top-k sports of the neighbourhood, reversed so the most frequent bar is on top.
    matches = random_pose["nearest_sports"].values[0][:k][::-1]
    total_matches = np.sum([m[1] for m in matches])
    axs[2].barh([m[0] for m in matches], [m[1] / total_matches for m in matches])
    # The neighbourhood comes from a radius query, not from a fixed count of 100.
    axs[2].set_title(f"Top {k} sports among the poses within radius {radius}")

    plt.tight_layout()
    plt.show()

## Evaluation

### Co-ranking Metrics: Trustworthiness and Continuity

References:
* https://towardsdatascience.com/on-the-validating-umap-embeddings-2c8907588175
* https://github.com/MoritzM00/drcomp/tree/main

In [None]:
# Trustworthiness and continuity of every embedding, for several neighbourhood sizes k.
ks = [10, 50, 100, 500, 1000]
for result in features_embeddings:
    result["trustworthiness"], result["continuity"] = compute_coranking_metrics(
        features, result["embeddings"], ks = ks
    )

In [None]:
# One line style per reducer family.
linestyles = {"PCA": "-.", "TSNE": "--", "UMAP": ":", "TRIMAP": "-"}
plt.figure(figsize=(10, 5))

for result in features_embeddings:
    reducer_name = result["reducer"].__name__
    plt.plot(
        ks,
        result["trustworthiness"],
        label = f"{reducer_name} - {format_params(result['reducer_params'])}",
        marker = "x",
        linestyle = linestyles[reducer_name],
    )

plt.xlabel("k")
plt.ylabel("Trustworthiness")
plt.title("Trustworthiness of the different embeddings (features: human angles)")
plt.legend(loc = [1.01, 0.2], fontsize = 10)
plt.show()

In [None]:
# Continuity curves; reuses the `linestyles` mapping from the trustworthiness cell.
plt.figure(figsize=(10, 5))

for result in features_embeddings:
    reducer_name = result["reducer"].__name__
    plt.plot(
        ks,
        result["continuity"],
        label = f"{reducer_name} - {format_params(result['reducer_params'])}",
        marker = "x",
        linestyle = linestyles[reducer_name],
    )

plt.xlabel("k")
plt.ylabel("Continuity")
plt.title("Continuity of the different embeddings (features: human angles)")
plt.legend(loc = [1.01, 0.2], fontsize = 10)
plt.show()

### Random Triplet Accuracy

In [None]:
# Pairwise distances and the full neighbour ranking in the original feature space.
original_d = squareform(pdist(features, metric="euclidean"))
dists,knn = NearestNeighbors(n_neighbors=len(features) - 1).fit(features).kneighbors()

# Random triplet accuracy of every embedding under the three sampling strategies.
for result in features_embeddings:
    embeddings_d = squareform(pdist(result["embeddings"], metric="euclidean"))
    for sampling in ("local", "mixed", "global"):
        result[f"triplet_accuracy_{sampling}"] = random_triplet_accuracy(
            knn, original_d, embeddings_d, sampling = sampling, n_repetitions=100
        )

In [None]:
# Wrapped "reducer - params" tick labels, one per embedding.
labels = []
for result in features_embeddings:
    full_label = f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}"
    labels.append(tw.fill(full_label, width = 20))

# One panel per sampling strategy; each point is (accuracy, std) of one embedding.
fig, axs = plt.subplots(3, 1, figsize=(20, 15))
for ax, sampling in zip(axs, ["local", "mixed", "global"]):
    for j, result in enumerate(features_embeddings):
        acc, std = result[f"triplet_accuracy_{sampling}"]
        ax.errorbar(j, acc, yerr = std, fmt = "o", color = "black")
    ax.set_ylabel("Accuracy")
    ax.set_title(f"Random triplet accuracy ({sampling} sampling)")
    ax.set_xticks(range(len(features_embeddings)), labels, rotation=0)
plt.tight_layout()
plt.show()

### Pearson Correlation Coefficient (PCC)

In [None]:
# Store the PCC of every embedding; the plotting cell reads element 0 as the value and
# element 1 as the error bar.
for result in features_embeddings:
    result["pcc"] = compute_pcc(features, result["embeddings"])

In [None]:
# PCC of every embedding, drawn as value +/- error bar.
pcc_tick_labels = [
    tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20)
    for result in features_embeddings
]

plt.figure(figsize=(20, 5))
for position, result in enumerate(features_embeddings):
    pcc_value, pcc_error = result["pcc"][0], result["pcc"][1]
    plt.errorbar(position, pcc_value, yerr=pcc_error, fmt="x", color = "black")
plt.xticks(range(len(features_embeddings)), pcc_tick_labels, rotation=0)
plt.ylabel("PCC")
plt.title("Pearson Correlation Coefficient (PCC) between the clusters in the high and low dimensional spaces")
plt.show()

### Global Score (GS)

In [None]:
# Store the global score of every embedding (plotted as one bar per embedding below).
for result in features_embeddings:
    result["global_score"] = global_score(features, result["embeddings"])

In [None]:
# Global score of every embedding, one bar each.
gs_tick_labels = [
    tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20)
    for result in features_embeddings
]

plt.figure(figsize=(20, 5))
for position, result in enumerate(features_embeddings):
    plt.bar(position, result["global_score"], color = "black")
plt.xticks(range(len(features_embeddings)), gs_tick_labels, rotation=0)
plt.ylabel("GS")
plt.title("Global Score (GS) of the embeddings")
plt.show()

# Clustering

In [None]:
# Cluster every low-dimensional embedding with HDBSCAN and keep, per point, the cluster
# label and the membership probability.
for result in features_embeddings:
    hdscan = HDBSCAN(min_cluster_size=10, min_samples=10, metric="euclidean").fit(result["embeddings"])
    result["clusters_labels"] = hdscan.labels_
    result["clusters_probs"] = hdscan.probabilities_

In [None]:
def visualize_clusters(embeddings_results, fig_title, d = 4):
    """Plot the clusters of every embedding in a grid with four columns.

    Args:
        embeddings_results: list of dicts with the keys "embeddings", "reducer",
            "reducer_params", "clusters_labels" and "clusters_probs".
        fig_title: title of the whole figure.
        d: side length of each panel (figure size units).

    Each cluster is drawn as its own scatter; marker sizes are the membership
    probabilities clipped to [0.1, 1], and the cluster labelled -1 is drawn fainter.
    """
    n_cols = 4
    n_plots = len(embeddings_results)
    n_rows = int(np.ceil(n_plots / n_cols))

    fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(n_cols * d, n_rows * d))
    axs = axs.flatten()

    for ax, result in zip(axs, embeddings_results):
        coords = result["embeddings"]
        reducer = result["reducer"]
        params = result["reducer_params"]
        labels = result["clusters_labels"]
        probs = result["clusters_probs"]

        for cluster in np.unique(labels):
            members = labels == cluster
            member_coords = coords[members]
            marker_sizes = np.clip(probs[members], 0.1, 1)
            if cluster != -1:
                alpha = 0.5
            else:
                alpha = 0.1
            ax.scatter(member_coords[:,0], member_coords[:,1], s = marker_sizes, alpha = alpha, label=f"Cluster {cluster}")

        ax.set_xticks([])
        ax.set_yticks([])
        title = f"{reducer.__name__} - params: {format_params(params)}"
        ax.set_title(tw.fill(title, width = 40), fontsize=10)

    # Switch off the panels that are left over in the last row.
    for unused_ax in axs[n_plots:n_rows * n_cols]:
        unused_ax.axis("off")

    plt.suptitle(fig_title)
    plt.tight_layout()
    plt.show()

In [None]:
# Cluster plots for every embedding.
# NOTE(review): "HDSCAN" in the figure title looks like a typo for "HDBSCAN".
visualize_clusters(features_embeddings, fig_title = "HDSCAN clustering")

In [None]:
# Mean +/- std of the cluster membership probabilities of every embedding.
prob_tick_labels = [
    tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20)
    for result in features_embeddings
]

plt.figure(figsize=(20, 5))
for position, result in enumerate(features_embeddings):
    cluster_probs = result["clusters_probs"]
    plt.errorbar(position, np.mean(cluster_probs), yerr = np.std(cluster_probs), fmt = "o", color = "black")
plt.xticks(range(len(features_embeddings)), prob_tick_labels, rotation=0)
plt.ylabel("Mean cluster probability")
plt.title("Mean cluster probability of the embeddings")
plt.show()

In [None]:
def compute_clusters_distances(data_high_dim, cluster_labels):
    """Summarise cosine distances inside and between clusters of the original features.

    Args:
        data_high_dim: 2-D array-like (n_samples, n_features). Rows are assumed to be
            non-zero vectors (cosine distance is undefined for a zero vector).
        cluster_labels: 1-D array-like with one label per row. Every distinct label is
            treated as a cluster; no label value is special-cased.

    Returns:
        (intra_cluster_d, between_cluster_d):
        * intra_cluster_d: one (mean, std) tuple per cluster with more than one point,
          computed over the distinct point pairs of that cluster only.
        * between_cluster_d: one (mean, std) tuple per ordered pair of different
          clusters (so every unordered pair appears twice), computed over all
          cross-cluster point pairs.
    """
    data_high_dim = np.asarray(data_high_dim)
    cluster_labels = np.asarray(cluster_labels)

    # Extract the points of each cluster once.
    cluster_points = [data_high_dim[cluster_labels == cluster_id]
                      for cluster_id in np.unique(cluster_labels)]

    intra_cluster_d = []
    for points in cluster_points:
        if len(points) > 1:  # a single point has no pairwise distance
            # pdist returns each distinct pair exactly once. Averaging a zero-filled
            # upper-triangular matrix instead would roughly halve the mean.
            pair_d = pdist(points, metric="cosine")
            intra_cluster_d.append((np.mean(pair_d), np.std(pair_d)))

    between_cluster_d = []
    for i, points_1 in enumerate(cluster_points):
        for j, points_2 in enumerate(cluster_points):
            if i != j:
                cross_d = cdist(points_1, points_2, metric="cosine")
                between_cluster_d.append((np.mean(cross_d), np.std(cross_d)))

    return intra_cluster_d, between_cluster_d

In [None]:
# Cluster labels come from the low-dimensional embeddings, while the distances are
# measured on the original high-dimensional features.
for result in features_embeddings:
    intra_cluster_d, between_cluster_d = compute_clusters_distances(features, result["clusters_labels"])
    result["intra_cluster_d"] = intra_cluster_d
    result["between_cluster_d"] = between_cluster_d

In [None]:
# Distribution of the per-cluster mean intra-cluster distance, one box per embedding.
intra_means = [
    [mean_d for mean_d, _ in result["intra_cluster_d"]]
    for result in features_embeddings
]
intra_tick_labels = [
    tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20)
    for result in features_embeddings
]

plt.figure(figsize=(20, 5))
plt.boxplot(intra_means, positions = range(len(features_embeddings)), showfliers=False)
plt.xticks(range(len(features_embeddings)), intra_tick_labels, rotation=0)
plt.ylabel("Mean intra-cluster distance")
plt.title("Intra-cluster distances")
plt.show()

In [None]:
# Distribution of the mean between-cluster distance, one box per embedding.
between_means = [
    [mean_d for mean_d, _ in result["between_cluster_d"]]
    for result in features_embeddings
]
between_tick_labels = [
    tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20)
    for result in features_embeddings
]

plt.figure(figsize=(20, 5))
plt.boxplot(between_means, positions = range(len(features_embeddings)), showfliers=False)
plt.xticks(range(len(features_embeddings)), between_tick_labels, rotation=0)
plt.ylabel("Mean between-cluster distance")
plt.title("Between-cluster distances")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Internal clustering scores, computed on the low-dimensional embeddings. When all
# points share one label the scores are set to 0 instead of being computed.
metric = "euclidean"
for result in features_embeddings:
    labels = result["clusters_labels"]
    if len(np.unique(labels)) == 1:
        for score_name in ("silhouette_score", "davies_bouldin_score", "calinski_harabasz_score"):
            result[score_name] = 0
        continue

    result["silhouette_score"] = silhouette_score(result["embeddings"], labels, metric = metric)
    result["davies_bouldin_score"] = davies_bouldin_score(result["embeddings"], labels)
    result["calinski_harabasz_score"] = calinski_harabasz_score(result["embeddings"], labels)

In [None]:
# One bar chart per clustering score, one bar per embedding.
score_names = ["silhouette_score", "davies_bouldin_score", "calinski_harabasz_score"]
bar_positions = range(len(features_embeddings))

fig, axs = plt.subplots(3, 1, figsize=(20, 15))
for ax, score in zip(axs, score_names):
    scores = [result[score] for result in features_embeddings]
    tick_labels = [
        tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20)
        for result in features_embeddings
    ]
    ax.bar(bar_positions, scores)
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(tick_labels, rotation=0)
    ax.set_title(score)
plt.tight_layout()
plt.show()