In [None]:
# Change the working directory two levels up before anything else runs; the
# relative paths used later in this notebook (e.g. "data/...") are resolved
# from there.
%cd ../..
# Enable the autoreload extension and reload modules before each execution.
%load_ext autoreload

%autoreload 2

In [None]:
import os
import random
import textwrap as tw
from collections import Counter
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from ast import literal_eval
from scipy.spatial.distance import pdist, squareform

from emv.features.pose import load_poses
from emv.features.pose_utils import draw_pose, CONNECTIONS, KEYPOINTS_NAMES, ANGLES_ASSOCIATIONS
from emv.features.pose_utils import compute_hips_angles, normalize_angles

# Clustering
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN, KMeans
from hdbscan import HDBSCAN

# DR
from umap import UMAP
from umap.umap_ import nearest_neighbors
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from trimap import TRIMAP

# Metrics
from emv.embeddings.dr_eval import compute_embeddings, compute_umap_embeddings, plot_embeddings, format_params
from emv.embeddings.dr_eval import \
    compute_coranking_metrics, \
    random_triplet_accuracy, \
    compute_pcc, \
    global_score

In [None]:
# Load the local pose sample; an empty filter dict is passed to the loader.
local_poses_path = "data/sample_poses_to_keep.csv"
pose_df = load_poses(local_poses_path, filter_poses={})

# `angle_vec_fix` is parsed with literal_eval, i.e. it is expected to hold
# Python-literal strings; the parsed objects are stored under `angle_vec`.
pose_df["angle_vec"] = pose_df["angle_vec_fix"].map(literal_eval)

In [None]:
# Number of poses available per sport.
pose_df["sport"].value_counts()

# Computing features

In [None]:
# An angle equal to 0 is treated as "missing" below and replaced by the mean of
# that angle over all poses where it is non-zero, plus a tiny random offset.
angle_names = list(ANGLES_ASSOCIATIONS.keys())
angles = pd.DataFrame(pose_df.angle_vec.tolist(), columns = angle_names)

# Per-angle mean over the non-zero entries (NaN if an angle is zero everywhere).
default_angles = [angles.loc[angles[angle] != 0, angle].mean() for angle in angle_names]

# A dedicated, seeded generator keeps the jitter identical across kernel
# restarts and cell re-runs (the global `random` module was unseeded before).
# NOTE(review): the jitter presumably avoids many exactly identical imputed
# values — confirm the intent.
random_size = 0.0001
jitter_rng = random.Random(42)
pose_df["angle_vec"] = pose_df.angle_vec.map(
    lambda vec: [
        a if a != 0 else default_angles[i] + jitter_rng.random() * random_size
        for i, a in enumerate(vec)
    ]
)

In [None]:
# Keep only poses with both hips: keypoints 7 and 8 must each have a third
# component (used here as a confidence score) above 0.5.
hips_visible = pose_df.keypoints.map(lambda kps: kps[7][2] > 0.5 and kps[8][2] > 0.5)
pose_df = pose_df[hips_visible]

# First element returned by compute_hips_angles, passed through normalize_angles.
pose_df["hips_angles"] = pose_df.keypoints.map(
    lambda kps: normalize_angles(compute_hips_angles(kps)[0])
)

In [None]:
# One column per keypoint name, leaving out the two hips themselves.
keypoints_names = [name for name in KEYPOINTS_NAMES if name not in ("left_hip", "right_hip")]

hips_angles = pd.DataFrame(pose_df["hips_angles"].to_list(), columns = keypoints_names)
hips_angles_means, hips_angles_std = hips_angles.mean(), hips_angles.std()

# Distribution of every hips-angle column.
fig, ax = plt.subplots(figsize=(15, 5))
hips_angles.boxplot(ax=ax)
ax.set_title("Hips angles")
plt.show()

# Embedding

In [None]:
# Cap every sport at N_sample poses: sports with fewer poses are kept whole,
# the others are down-sampled with a fixed seed.
N_sample = 500
sport_poses = pd.concat([
    poses_of_sport if len(poses_of_sport) < N_sample else poses_of_sport.sample(N_sample, random_state=42)
    for poses_of_sport in (pose_df[pose_df.sport == sport] for sport in pose_df.sport.unique())
])
print(f"Testing with {len(sport_poses)} poses.")

# Feature matrix built from the angle vectors of the sampled poses.
human_angles = np.array(sport_poses["angle_vec"].tolist())

In [None]:
# NOTE(review): the cell right after this one rebuilds `human_angles_embeddings`
# from scratch, so the list built here is overwritten when the notebook is run
# top to bottom.

# PCA embeddings
pca_result = compute_embeddings(features = human_angles, reducer = PCA, params = {"n_components": 2})
human_angles_embeddings = [pca_result]

# UMAP embeddings
n_neighbors = [50, 100, 500]
umap_results = compute_umap_embeddings(features = human_angles, n_neighbors = n_neighbors, min_dist = 0.1)
human_angles_embeddings.extend(umap_results)

In [None]:
human_angles = np.array(sport_poses["angle_vec"].tolist())

# PCA embeddings (first entry of the results list)
pca_result = compute_embeddings(features = human_angles, reducer = PCA, params = {"n_components": 2})
human_angles_embeddings = [pca_result]

# UMAP embeddings
n_neighbors = [50, 100, 500]
umap_results = compute_umap_embeddings(features = human_angles, n_neighbors = n_neighbors)
human_angles_embeddings.extend(umap_results)

# TSNE embeddings, one per perplexity
perps = [5, 10, 50, 100]
for perp in perps:
    tsne_params = {"n_components": 2, "metric": "cosine", "perplexity": perp}
    human_angles_embeddings.append(compute_embeddings(features = human_angles, reducer = TSNE, params = tsne_params))

# TRIMAP embeddings
# Ratio of 2:1:1 for n_inliers:n_outliers:n_random (as recommended in the paper)
n_inliers_values = [10, 20, 50]
for n in n_inliers_values:
    m = n // 2
    trimap_params = {"n_inliers": n, "n_outliers": m, "n_random": m, "distance": "cosine"}
    human_angles_embeddings.append(compute_embeddings(features = human_angles, reducer = TRIMAP, params = trimap_params))

In [None]:
# Plot every embedding stored in the results list.
plot_embeddings(
    human_angles_embeddings,
    fig_title = "Human angles embeddings",
)

### Plot with thumbnails

In [None]:
from io import BytesIO
from PIL import Image
import base64

from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from emv.features.pose_utils import CONNECTIONS, KEYPOINTS_NAMES

In [None]:
def draw_annotation_pose(pose, threshold = 0.1, linewidth = 5, color = "black", alpha = 1):
    """Render a pose skeleton as a 128x128 RGBA thumbnail.

    Args:
        pose: record indexable by "keypoints" and "bbox". Each keypoint is
            indexed as [0] -> x, [1] -> y, [2] -> value compared to `threshold`.
            "bbox" is used as (x, y, width, height) to set the axis limits.
        threshold: keypoints (and connections touching them) whose third
            component is not above this value are not drawn.
        linewidth: width of the lines joining connected keypoints.
        color: colour of both the keypoints and the connections.
        alpha: opacity of both the keypoints and the connections.

    Returns:
        A PIL image in RGBA mode in which every pure-white pixel has been
        made fully transparent.
    """
    fig = plt.figure(figsize=(4,4))
    ax = fig.gca()

    keypoints = pose["keypoints"]
    visible = [kp for kp in keypoints if kp[2] > threshold]
    ax.scatter([kp[0] for kp in visible],
               [kp[1] for kp in visible],
               s=10, color=color, alpha = alpha)

    # Draw a segment only when both of its end points pass the threshold.
    for link in CONNECTIONS:
        start = keypoints[KEYPOINTS_NAMES.index(link[0])]
        end = keypoints[KEYPOINTS_NAMES.index(link[1])]
        if start[2] > threshold and end[2] > threshold:
            ax.plot([start[0], end[0]],
                    [start[1], end[1]],
                    linewidth=linewidth, color=color, alpha = alpha)

    # Crop to the bounding box; the y limits are given bottom-first so the
    # y axis is inverted.
    bbox = pose["bbox"]
    ax.set_xlim(int(bbox[0]), int(bbox[0] + bbox[2]))
    ax.set_ylim(int(bbox[1] + bbox[3]), int(bbox[1]))

    ax.axis("off")
    ax.patch.set_alpha(0)

    # Rasterise the figure into memory, then release it.
    buffer = BytesIO()
    fig.savefig(buffer)
    buffer.seek(0)
    plt.close(fig)

    img = Image.open(buffer).resize((128,128), Image.Resampling.BICUBIC).convert("RGBA")

    # Pure-white pixels become fully transparent; everything else is untouched.
    img.putdata([
        (255, 255, 255, 0) if pixel[:3] == (255, 255, 255) else pixel
        for pixel in img.getdata()
    ])

    return img

In [None]:
# One "hsv" palette colour per sport, in order of first appearance.
sports = sport_poses.sport.unique()
colors = sns.color_palette("hsv", len(sports))
colors_map = dict(zip(sports, colors))

In [None]:
# The plots below use the last entry of the results list. Alternative kept for
# reference (a dedicated UMAP run):
#embeddings = compute_umap_embeddings(features = human_angles, n_neighbors = [500], min_dist = 0.5)[0]["embeddings"]
selected_result = human_angles_embeddings[-1]
embeddings = selected_result["embeddings"]

In [None]:
# Plot of the embedding colored by sport
point_colors = [colors_map[s] for s in sport_poses.sport]
# Proxy artists: one filled circle per sport for the legend.
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label=s, markerfacecolor=colors_map[s], markersize=10)
    for s in sports
]

plt.figure(figsize=(8,8))
plt.scatter(embeddings[:,0], embeddings[:,1], c = point_colors, s = 0.1)
plt.legend(handles=legend_handles, loc = [1.01, 0])
plt.show()

In [None]:
#sport_poses["embedding"] = embeddings.tolist()

# One small panel per sport, showing only that sport's points.
n_rows = 4
# Ceil division: just enough columns (the previous `int(n / n_rows) + 1`
# allocated an empty extra column whenever n was a multiple of n_rows).
n_cols = max(1, int(np.ceil(len(sports) / n_rows)))
d = 3

fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * d, n_rows * d), squeeze=False)
axs = axs.flatten()
for i, sport in enumerate(sports):
    sport_mask = (sport_poses.sport == sport).to_numpy()
    # `color=` (not `c=`): a bare RGB tuple given as `c` is interpreted as
    # values to colormap when the sport has exactly as many points as the tuple
    # has components.
    axs[i].scatter(embeddings[sport_mask, 0], embeddings[sport_mask, 1], color = colors_map[sport], s = 0.1)
    axs[i].set_title(sport)
    # NOTE(review): fixed limits assume the embedding is scaled to [0, 1] — confirm.
    axs[i].set_xlim(-0.1,1.1)
    axs[i].set_ylim(-0.1,1.1)
    axs[i].set_xticks([])
    axs[i].set_yticks([])

# Hide the panels that have no sport assigned.
for unused_ax in axs[len(sports):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Show a maximum of N thumbnails on the plot
N = 1000
# Ceil division guarantees at most N rows are kept; flooring could keep up to
# almost 2N (e.g. 1999 poses -> stride 1 -> 1999 thumbnails).
EVERY_N = max(1, int(np.ceil(len(sport_poses) / N)))

# Copy so that columns added to the sample later do not write into a view of
# `sport_poses`.
sample_poses = sport_poses.iloc[::EVERY_N].copy()

# One black skeleton thumbnail per sampled pose
# (pass color = colors_map[pose["sport"]] to colour by sport instead).
thumbnails = [
    draw_annotation_pose(pose, color = "black")
    for _, pose in tqdm(sample_poses.iterrows(), total = len(sample_poses))
]
    

In [None]:
from emv.embeddings.dr_eval import plot_embeddings_with_images

# Same stride as the one used to sample the poses, so rows stay aligned with
# the thumbnails.
#embeddings = human_angles_embeddings[3]["embeddings"][::EVERY_N]
sample_embeddings = embeddings[::EVERY_N]
plot_embeddings_with_images(
    sample_embeddings,
    thumbnails,
    zoom = 0.2,
    figsize = 20,
)

### Nearest Neighbors

In [None]:
k = 5           # number of sports shown in the bar chart
radius = 0.05   # neighbourhood radius, in embedding units

# Radius query on the sampled embedding. Because the fitted points are also the
# query points, every pose is part of its own neighbourhood.
knn = NearestNeighbors(radius = radius)
knn.fit(sample_embeddings)
dists, ids = knn.radius_neighbors(sample_embeddings)

sample_poses["coords"] = list(sample_embeddings)
sample_poses["nearest_ids"] = list(ids)
# (sport, count) pairs over the neighbourhood, most frequent first.
# `Counter` comes from `collections` (imported at the top of the notebook).
sample_poses["nearest_sports"] = sample_poses.nearest_ids.map(lambda x: Counter(sample_poses.iloc[x].sport.tolist()).most_common())


# Inspect 10 poses; seeding each draw makes the figures reproducible.
for seed in range(10):
    fig, axs = plt.subplots(1, 3, figsize=(18, 6))
    random_pose = sample_poses.sample(1, random_state = seed)
    pose_coords = random_pose["coords"].values[0]

    draw_pose(random_pose.squeeze(axis = 0), ax = axs[0], show_frame = True)

    # Whole embedding, with the selected pose and its search radius in red.
    axs[1].scatter(sample_embeddings[:,0], sample_embeddings[:,1], s = 1)
    axs[1].scatter(pose_coords[0], pose_coords[1], s = 30, color = "red", marker = "x")
    axs[1].add_patch(plt.Circle((pose_coords[0], pose_coords[1]), radius, color='red', fill = False))

    axs[1].set_title(f"Embedded poses - Matched pose from {random_pose.sport.values[0]}")
    axs[1].set_xticks([])
    axs[1].set_yticks([])

    # Top-k sports, reversed so the most frequent ends up at the top of the
    # horizontal bar chart; shares are relative to the top-k total.
    matches = random_pose["nearest_sports"].values[0][:k][::-1]
    total_matches = np.sum([m[1] for m in matches])
    axs[2].barh([m[0] for m in matches], [m[1] / total_matches for m in matches])
    # The neighbourhood is radius-based, not a fixed "top 100".
    axs[2].set_title(f"Top {k} sports among the poses within radius {radius}")

    plt.tight_layout()
    plt.show()

## Evaluation

### Co-ranking Metrics: Trustworthiness and Continuity

References:
* https://towardsdatascience.com/on-the-validating-umap-embeddings-2c8907588175
* https://github.com/MoritzM00/drcomp/tree/main

In [None]:
# Neighbourhood sizes at which the two co-ranking metrics are evaluated.
ks = [10, 50, 100, 500, 1000]
for result in human_angles_embeddings:
    # The helper returns (trustworthiness values, continuity values).
    result["trustworthiness"], result["continuity"] = compute_coranking_metrics(
        human_angles, result["embeddings"], ks = ks
    )

In [None]:
# Line style per reducer class name.
linestyles = {"PCA": "-.", "TSNE": "--", "UMAP": ":", "TRIMAP": "-"}

plt.figure(figsize=(10, 5))
for result in human_angles_embeddings:
    reducer_name = result["reducer"].__name__
    curve_label = f"{reducer_name} - {format_params(result['reducer_params'])}"
    plt.plot(ks, result["trustworthiness"],
             label = curve_label,
             marker = "x",
             linestyle = linestyles[reducer_name])

plt.xlabel("k")
plt.ylabel("Trustworthiness")
plt.title("Trustworthiness of the different embeddings (features: human angles)")
plt.legend(loc = [1.01, 0.2], fontsize = 10)
plt.show()

In [None]:
# Same layout as the trustworthiness figure; relies on `linestyles` and `ks`
# defined in earlier cells.
plt.figure(figsize=(10, 5))
for result in human_angles_embeddings:
    reducer_name = result["reducer"].__name__
    curve_label = f"{reducer_name} - {format_params(result['reducer_params'])}"
    plt.plot(ks, result["continuity"],
             label = curve_label,
             marker = "x",
             linestyle = linestyles[reducer_name])

plt.xlabel("k")
plt.ylabel("Continuity")
plt.title("Continuity of the different embeddings (features: human angles)")
plt.legend(loc = [1.01, 0.2], fontsize = 10)
plt.show()

### Random Triplet Accuracy

In [None]:
# Full pairwise Euclidean distance matrix and complete neighbour ranking in the
# original feature space.
original_d = squareform(pdist(human_angles, metric="euclidean"))
neighbors_index = NearestNeighbors(n_neighbors=len(human_angles) - 1).fit(human_angles)
dists, knn = neighbors_index.kneighbors()

for result in human_angles_embeddings:
    embeddings_d = squareform(pdist(result["embeddings"], metric="euclidean"))
    # Stored under "triplet_accuracy_<sampling>", evaluated in this order.
    for sampling in ("local", "mixed", "global"):
        result[f"triplet_accuracy_{sampling}"] = random_triplet_accuracy(
            knn, original_d, embeddings_d, sampling = sampling, n_repetitions=100
        )

In [None]:
# Wrapped "<reducer> - <params>" tick labels, one per embedding.
labels = [tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20) for result in human_angles_embeddings]
tick_positions = range(len(human_angles_embeddings))

# One panel per sampling strategy; each stored value unpacks as (accuracy, std).
fig, axs = plt.subplots(3, 1, figsize=(20, 15))
for ax, sampling in zip(axs, ["local", "mixed", "global"]):
    for j, result in enumerate(human_angles_embeddings):
        acc, std = result[f"triplet_accuracy_{sampling}"]
        ax.errorbar(j, acc, yerr = std, fmt = "o", color = "black")
    ax.set_ylabel("Accuracy")
    ax.set_title(f"Random triplet accuracy ({sampling} sampling)")
    ax.set_xticks(tick_positions, labels, rotation=0)
plt.tight_layout()
plt.show()

### Pearson Correlation Coefficient (PCC)

In [None]:
# Attach the PCC result to every embedding record.
for result in human_angles_embeddings:
    pcc_result = compute_pcc(human_angles, result["embeddings"])
    result["pcc"] = pcc_result

In [None]:
tick_labels = [tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20) for result in human_angles_embeddings]

plt.figure(figsize=(20, 5))
for i, result in enumerate(human_angles_embeddings):
    # First element is plotted as the value, second as its error bar.
    pcc_value, pcc_err = result["pcc"][0], result["pcc"][1]
    plt.errorbar(i, pcc_value, yerr=pcc_err, fmt="x", color = "black")
plt.xticks(range(len(human_angles_embeddings)), tick_labels, rotation=0)
plt.ylabel("PCC")
plt.title("Pearson Correlation Coefficient (PCC) between the clusters in the high and low dimensional spaces")
plt.show()

### Global Score (GS)

In [None]:
# Attach the global score to every embedding record.
for result in human_angles_embeddings:
    gs_value = global_score(human_angles, result["embeddings"])
    result["global_score"] = gs_value

In [None]:
tick_labels = [tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20) for result in human_angles_embeddings]

# One black bar per embedding.
plt.figure(figsize=(20, 5))
for i, result in enumerate(human_angles_embeddings):
    plt.bar(i, result["global_score"], color = "black")
plt.xticks(range(len(human_angles_embeddings)), tick_labels, rotation=0)
plt.ylabel("GS")
plt.title("Global Score (GS) of the embeddings")
plt.show()

# Clustering

In [None]:
# Cluster every 2D embedding with HDBSCAN and keep, per point, the cluster
# label and the membership probability reported by the fitted model.
for result in human_angles_embeddings:
    hdscan = HDBSCAN(min_cluster_size=10, min_samples=10, metric="euclidean")
    hdscan.fit(result["embeddings"])
    result["clusters_labels"], result["clusters_probs"] = hdscan.labels_, hdscan.probabilities_

In [None]:
def visualize_clusters(embeddings_results, fig_title, d = 4):
    """Draw one scatter panel per embedding, with points grouped by cluster label.

    Args:
        embeddings_results: sequence of records providing "embeddings",
            "reducer", "reducer_params", "clusters_labels" and "clusters_probs".
        fig_title: title placed above the whole figure.
        d: side length of each panel, used to size the figure.
    """
    n_cols = 4
    n_plots = len(embeddings_results)
    n_rows = int(np.ceil(n_plots / n_cols))

    fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(n_cols * d, n_rows * d))
    axs = axs.flatten()
    for ax, result in zip(axs, embeddings_results):
        coords = result["embeddings"]
        labels = result["clusters_labels"]
        probs = result["clusters_probs"]

        for cluster in np.unique(labels):
            in_cluster = labels == cluster
            # Marker size follows the membership probability, clipped to [0.1, 1].
            sizes = np.clip(probs[in_cluster], 0.1, 1)
            # Label -1 is drawn fainter than the other labels.
            alpha = 0.1 if cluster == -1 else 0.5
            ax.scatter(coords[in_cluster, 0], coords[in_cluster, 1], s = sizes, alpha = alpha, label=f"Cluster {cluster}")

        ax.set_xticks([])
        ax.set_yticks([])
        title = f"{result['reducer'].__name__} - params: {format_params(result['reducer_params'])}"
        ax.set_title(tw.fill(title, width = 40), fontsize=10)

    # Switch off the panels left over at the end of the grid.
    for unused_ax in axs[n_plots:]:
        unused_ax.axis("off")
    plt.suptitle(fig_title)
    plt.tight_layout()
    plt.show()

In [None]:
# Figure title spells the algorithm name as used in the code (HDBSCAN).
visualize_clusters(human_angles_embeddings, fig_title = "HDBSCAN clustering")

In [None]:
tick_labels = [tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20) for result in human_angles_embeddings]

# Mean membership probability per embedding, with its standard deviation as
# error bar.
plt.figure(figsize=(20, 5))
for i, result in enumerate(human_angles_embeddings):
    probs = result["clusters_probs"]
    plt.errorbar(i, np.mean(probs), yerr = np.std(probs), fmt = "o", color = "black")
plt.xticks(range(len(human_angles_embeddings)), tick_labels, rotation=0)
plt.ylabel("Mean cluster probability")
plt.title("Mean cluster probability of the embeddings")
plt.show()

In [None]:
def compute_clusters_distances(data_high_dim, cluster_labels):
    """Summarise cosine distances within and between clusters.

    Distances are computed on `data_high_dim` (the original features), with
    points grouped by `cluster_labels`. Every distinct label is treated as a
    cluster, including -1 if present.

    Args:
        data_high_dim: array-like of shape (n_samples, n_features).
        cluster_labels: array-like of n_samples cluster labels.

    Returns:
        A tuple (intra_cluster_d, between_cluster_d) of two lists of
        (mean, std) tuples:
        - intra_cluster_d: one entry per cluster with more than one point,
          computed over the distinct pairs of points of that cluster;
        - between_cluster_d: one entry per ordered pair of distinct clusters
          (so each unordered pair contributes two identical entries).

    Note:
        Cosine distance is undefined for all-zero rows; scipy yields NaN for
        pairs that involve one.
    """
    # Local import: `pairwise_distances` was never imported in this notebook;
    # scipy provides the same cosine distances for non-zero rows.
    from scipy.spatial.distance import cdist, pdist

    data_high_dim = np.asarray(data_high_dim)
    cluster_labels = np.asarray(cluster_labels)

    # Slice the data once per cluster instead of once per pair of clusters.
    points_by_cluster = {
        cluster_id: data_high_dim[cluster_labels == cluster_id]
        for cluster_id in np.unique(cluster_labels)
    }

    intra_cluster_d = []
    for cluster_points in points_by_cluster.values():
        if len(cluster_points) > 1:  # Ensure there's more than one point in the cluster
            # pdist returns each distinct pair exactly once. Averaging the full
            # matrix after np.triu also counted the zeroed lower triangle and
            # diagonal, which pulled the mean towards 0.
            cluster_pair_d = pdist(cluster_points, metric = "cosine")
            intra_cluster_d.append((np.mean(cluster_pair_d), np.std(cluster_pair_d)))

    between_cluster_d = []
    for cluster_id1, cluster1_points in points_by_cluster.items():
        for cluster_id2, cluster2_points in points_by_cluster.items():
            if cluster_id1 != cluster_id2:
                ds = cdist(cluster1_points, cluster2_points, metric = "cosine")
                between_cluster_d.append((np.mean(ds), np.std(ds)))

    return intra_cluster_d, between_cluster_d

In [None]:
# Distances are measured on the original features (`human_angles`), grouped by
# the cluster labels found on each embedding.
for result in human_angles_embeddings:
    result["intra_cluster_d"], result["between_cluster_d"] = compute_clusters_distances(
        human_angles, result["clusters_labels"]
    )

In [None]:
# For each embedding, the list of per-cluster mean distances (first tuple item).
intra_means = [[d[0] for d in result["intra_cluster_d"]] for result in human_angles_embeddings]
tick_labels = [tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20) for result in human_angles_embeddings]

plt.figure(figsize=(20, 5))
plt.boxplot(intra_means, positions = range(len(human_angles_embeddings)), showfliers=False)
plt.xticks(range(len(human_angles_embeddings)), tick_labels, rotation=0)
plt.ylabel("Mean intra-cluster distance")
plt.title("Intra-cluster distances")
plt.show()

In [None]:
# For each embedding, the list of per-cluster-pair mean distances (first tuple item).
between_means = [[d[0] for d in result["between_cluster_d"]] for result in human_angles_embeddings]
tick_labels = [tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20) for result in human_angles_embeddings]

plt.figure(figsize=(20, 5))
plt.boxplot(between_means, positions = range(len(human_angles_embeddings)), showfliers=False)
plt.xticks(range(len(human_angles_embeddings)), tick_labels, rotation=0)
plt.ylabel("Mean between-cluster distance")
plt.title("Between-cluster distances")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Internal clustering scores, computed on the 2D embeddings.
metric = "euclidean"
score_names = ("silhouette_score", "davies_bouldin_score", "calinski_harabasz_score")
for result in human_angles_embeddings:
    labels = result["clusters_labels"]
    if len(np.unique(labels)) < 2:
        # The scores are undefined with a single label. NaN marks them as
        # missing; a hard-coded 0 would read as a perfect Davies-Bouldin score
        # (lower is better for that metric).
        for score_name in score_names:
            result[score_name] = np.nan
    else:
        result["silhouette_score"] = silhouette_score(result["embeddings"], labels, metric = metric)
        result["davies_bouldin_score"] = davies_bouldin_score(result["embeddings"], labels)
        result["calinski_harabasz_score"] = calinski_harabasz_score(result["embeddings"], labels)

In [None]:
score_names = ["silhouette_score", "davies_bouldin_score", "calinski_harabasz_score"]
tick_labels = [tw.fill(f"{result['reducer'].__name__} - {format_params(result['reducer_params'])}", width = 20) for result in human_angles_embeddings]

# One bar chart per score, one bar per embedding.
fig, axs = plt.subplots(3, 1, figsize=(20, 15))
for ax, score in zip(axs, score_names):
    ax.bar(range(len(human_angles_embeddings)), [result[score] for result in human_angles_embeddings])
    ax.set_xticks(range(len(human_angles_embeddings)))
    ax.set_xticklabels(tick_labels, rotation=0)
    ax.set_title(score)
plt.tight_layout()
plt.show()