In [None]:
# Move the working directory two levels up so the relative paths used below
# (e.g. "data/sample_pose_df.csv") resolve from there.
# NOTE(review): presumably this lands on the repository root — confirm. The cell is not
# idempotent: running it a second time moves two more levels up.
%cd ../..

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import cv2
from numba import jit

from sklearn.cluster import DBSCAN, KMeans
from hdbscan import HDBSCAN
from umap import UMAP

from emv.features.pose import load_poses 
from emv.features.pose_utils import draw_pose, CONNECTIONS, KEYPOINTS_NAMES, ANGLES_ASSOCIATIONS

from bokeh.io import output_notebook, show
from bokeh.layouts import layout, column, row
from bokeh.plotting import figure
from bokeh import palettes

# Configure Bokeh to render `show(...)` output inline in the notebook.
output_notebook()

In [None]:
# Path of the pose sample, relative to the current working directory.
local_poses_path = "data/sample_pose_df.csv"
# NOTE(review): an empty `filter_poses` dict presumably means "keep every pose" — confirm against `load_poses`.
pose_df = load_poses(local_poses_path, filter_poses={})

In [None]:
# Show the first two rows to check which columns were loaded.
pose_df.head(2)

In [None]:
# One column per named angle, one row per pose.
angles = pd.DataFrame(pose_df.angle_vec.tolist(), columns = ANGLES_ASSOCIATIONS.keys())

# An angle equal to 0 is treated as missing; its replacement value is the mean of the
# non-zero entries of the same angle over all poses.
default_angles = [
    angles.loc[angles[angle] != 0, angle].mean()
    for angle in ANGLES_ASSOCIATIONS.keys()
]

# Upper bound of the random offset added to every imputed value.
random_size = 0.0001


def _fill_missing_angles(angle_vec):
    """Return a list copy of `angle_vec` where each 0 is replaced by the default for that position plus a random offset in [0, random_size)."""
    filled = []
    for i, a in enumerate(angle_vec):
        if a != 0:
            filled.append(a)
        else:
            filled.append(default_angles[i] + random.random() * random_size)
    return filled


pose_df["angle_vec_fix"] = pose_df.angle_vec.map(_fill_missing_angles)

In [None]:
# Grouped bar chart of the per-angle mean for three variants: the raw values (zeros
# included), the raw values with zeros masked out, and the imputed `angle_vec_fix` values.
width = 0.2
r = np.arange(len(ANGLES_ASSOCIATIONS))
r1 = list(r - width)
r2 = list(r + width)

fixed_angles = pd.DataFrame(pose_df.angle_vec_fix.tolist(), columns = ANGLES_ASSOCIATIONS.keys())

# (bar positions, bar heights, legend entry) — drawn in this order.
bar_series = [
    (r, angles.mean(), "Before fix (including missing values)"),
    (r1, angles.replace(0, np.nan).mean(), "Before fix (excluding missing values)"),
    (r2, fixed_angles.mean(), "After fix"),
]

plt.figure(figsize=(15,5))
for positions, mean_values, legend_label in bar_series:
    plt.bar(positions, mean_values, width = width, label=legend_label)
plt.xticks(r, ANGLES_ASSOCIATIONS.keys(), rotation=0)
plt.legend()
plt.show()

In [None]:
def get_directions(keypoints):
    """Compute one 2D displacement vector per skeleton connection.

    For every `(kp1, kp2)` pair in `CONNECTIONS`, both keypoints are looked up by name
    through `KEYPOINTS_NAMES` and the vector `(x2 - x1, y2 - y1)` is recorded.

    Args:
        keypoints: sequence indexed like `KEYPOINTS_NAMES`; every entry must unpack into
            exactly three values, of which only the first two (x, y) are used.

    Returns:
        np.ndarray built from the `(dx, dy)` tuples, in `CONNECTIONS` order.
    """
    def _displacement(start_name, end_name):
        # The third component of each keypoint is unpacked but deliberately ignored.
        x_start, y_start, _ = keypoints[KEYPOINTS_NAMES.index(start_name)]
        x_end, y_end, _ = keypoints[KEYPOINTS_NAMES.index(end_name)]
        return (x_end - x_start, y_end - y_start)

    return np.array([_displacement(kp1, kp2) for kp1, kp2 in CONNECTIONS])

In [None]:
def _unit_flatten(direction_matrix):
    """Flatten the per-connection direction array and scale it to unit Euclidean norm."""
    flat = direction_matrix.flatten()
    return flat / np.linalg.norm(flat)


def _concat_directions_and_angles(pose_row):
    """Join a row's normalised directions and its imputed angles into one feature vector."""
    return np.concatenate([pose_row.norm_directions, pose_row.angle_vec_fix])


# Raw directions -> unit-norm flattened directions -> directions + angles feature.
pose_df["directions"] = pose_df.keypoints.apply(get_directions)
pose_df["norm_directions"] = pose_df.directions.apply(_unit_flatten)
pose_df["feature_angle_dir"] = pose_df.apply(_concat_directions_and_angles, axis = 1)

# Clustering

## All poses

In [None]:
# UMAP projection of every pose; the first two embedding columns are stored on `pose_df`.
# NOTE(review): this embeds the raw `angle_vec` (zeros still present), not the imputed
# `angle_vec_fix` — confirm that is intended.
reducer = UMAP(n_neighbors=500, min_dist=0.1, metric='cosine')
embeddings = reducer.fit_transform(pose_df["angle_vec"].tolist())
pose_df["umap_x"], pose_df["umap_y"] = embeddings[:, 0], embeddings[:, 1]

In [None]:
def cluster_poses(algo, pose_df, plot=True):
    """Cluster poses on their UMAP coordinates and return a labelled copy of the frame.

    Args:
        algo: clusterer exposing `fit(X)` and, once fitted, a `labels_` attribute
            (e.g. the `KMeans` / `DBSCAN` classes imported by this notebook).
        pose_df: DataFrame with `umap_x` and `umap_y` columns. It is left untouched.
        plot: when True, draw a scatter plot of the embedding with one colour per label.

    Returns:
        A copy of `pose_df` with an added (or replaced) `label` column holding
        `algo.labels_`.
    """
    # Work on a copy: writing `label` into the caller's frame and returning that same
    # object made the results of successive calls alias each other, so a second
    # clustering run silently overwrote the labels of the first.
    labelled = pose_df.copy()
    algo.fit(labelled[["umap_x", "umap_y"]])
    labelled["label"] = algo.labels_

    if plot:
        plt.figure(figsize=(10,10))
        # sort=False keeps the labels in order of first appearance, so colours are
        # assigned in the same order as iterating over `unique()`.
        for label, members in labelled.groupby("label", sort=False):
            plt.scatter(members["umap_x"], members["umap_y"], s=1, label=label)
        plt.axis('off')
        plt.show()

    return labelled

In [None]:
# K-means with 25 clusters on the UMAP coordinates of all poses.
# NOTE(review): no `random_state` is passed, so labels may differ between runs.
kmeans = KMeans(n_clusters=25)
kmeans_poses = cluster_poses(algo=kmeans, pose_df=pose_df)

In [None]:
# DBSCAN (eps=0.2, min_samples=10) on the UMAP coordinates of all poses.
dbscan = DBSCAN(eps=0.2, min_samples=10)
dbscan_poses = cluster_poses(algo=dbscan, pose_df=pose_df)

In [None]:
# For each of the first three labels (in order of first appearance in the `label`
# column), draw up to nrows * ncols randomly sampled poses on a grid of subplots.
nrows = 2
ncols = 6
grid_size = nrows * ncols

for cluster in dbscan_poses["label"].unique()[:3]:
    print("Cluster", cluster)
    cluster_members = dbscan_poses[dbscan_poses["label"] == cluster]
    n_samples = min(len(cluster_members), grid_size)
    sample_poses = cluster_members.sample(n_samples).reset_index(drop=True)

    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 3, nrows * 3))
    axs = axs.flatten()
    # After reset_index the row label doubles as the position of the subplot.
    for i, pose in sample_poses.iterrows():
        draw_pose(pose, ax = axs[i], cut = True)

    plt.show()


## By sport

In [None]:
def get_representative_poses(pose_df, sport, feature, clusterer = DBSCAN(eps=0.2, min_samples=10), metric = "euclidean"):
    """Embed one sport's poses with UMAP, cluster the embedding and pick one pose per cluster.

    The function also shows a Bokeh dashboard: the embedding coloured by cluster (with
    the representative poses marked by a red cross), the size of every cluster, and the
    mean ± std membership probability per cluster.

    Args:
        pose_df: DataFrame with a `sport` column and a `feature` column of vectors.
            It is not modified.
        sport: value of the `sport` column to keep.
        feature: name of the column whose vectors are embedded.
        clusterer: object exposing `fit(X)` and `labels_`; label -1 is treated as
            "not assigned". When it also exposes `probabilities_` (as HDBSCAN does)
            those are used; otherwise assigned points get probability 1 and
            unassigned points 0. The default instance is created once at definition
            time and is refitted on every call.
        metric: distance metric handed to UMAP.

    Returns:
        (representative_poses, sport_poses) where `representative_poses` is a DataFrame
        with one row per cluster (the member closest to the cluster's mean embedding,
        empty when nothing was clustered) and `sport_poses` is a copy of the selected
        rows with `umap_x`, `umap_y`, `label` and `probability` columns added.

    Raises:
        ValueError: if `pose_df` holds no row for `sport`.
    """
    # Copy the selection so the column assignments below never target a slice of `pose_df`.
    sport_poses = pose_df[pose_df.sport == sport].copy()
    if sport_poses.empty:
        raise ValueError(f"No poses found for sport {sport!r}")

    # Named `reducer` / `dashboard` rather than `umap` / `layout` to avoid shadowing imports.
    reducer = UMAP(n_components=2, n_neighbors=1000, min_dist=0.1, metric=metric)
    embeddings = reducer.fit_transform(sport_poses[feature].tolist())
    sport_poses["umap_x"] = embeddings[:,0]
    sport_poses["umap_y"] = embeddings[:,1]

    clusterer.fit(embeddings)
    sport_poses["label"] = clusterer.labels_
    # DBSCAN and KMeans have no `probabilities_`; reading it unconditionally made the
    # default clusterer fail with AttributeError.
    if hasattr(clusterer, "probabilities_"):
        sport_poses["probability"] = clusterer.probabilities_
    else:
        sport_poses["probability"] = (sport_poses["label"] != -1).astype(float)

    p1 = figure(title="UMAP", width=500, height=500)

    palette = palettes.Turbo256
    labels = sport_poses["label"].unique()
    # Spread the labels over the palette; the index is capped to stay inside it.
    label_colors = {label: palette[min(255, int(256 * (label + 1) / (len(labels) + 1)))] for label in labels}

    representative_poses = []
    for label in labels:
        members = sport_poses[sport_poses.label == label]
        p1.scatter(members["umap_x"], members["umap_y"],
                   alpha = members["probability"].clip(0.1, 1),
                   color = label_colors[label], size = 1)
        if label != -1:
            # Search among the cluster's own members only: the point of the whole
            # embedding nearest to the centroid may belong to another cluster or to noise.
            member_coords = members[["umap_x", "umap_y"]].values
            distances = np.linalg.norm(member_coords - member_coords.mean(axis=0), axis=1)
            representative_pose = members.iloc[np.argmin(distances)]
            representative_poses.append(representative_pose)
            p1.scatter([representative_pose["umap_x"]], [representative_pose["umap_y"]],
                       color="red", alpha = 1, size=12, marker = "x")

    if representative_poses:
        representative_poses = pd.concat([p.to_frame().T for p in representative_poses])
    else:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        representative_poses = sport_poses.iloc[0:0]

    assigned = sport_poses[sport_poses.label != -1]
    cluster_sizes = assigned.label.value_counts()
    cluster_ticks = cluster_sizes.index.tolist()

    p2 = figure(title=f"Items not assigned: {len(sport_poses) - len(assigned)}", width=500, height=250)
    p2.vbar(x=cluster_sizes.index, top=cluster_sizes.values, width=0.9, color = "blue")
    p2.xaxis.ticker = cluster_ticks

    p3 = figure(title="Mean probability per cluster", width=500, height=250)
    cluster_probs = assigned.groupby("label").probability
    mean_probs = cluster_probs.mean()
    # The std of a single-member cluster is NaN; draw a zero-length error bar instead.
    mean_stds = cluster_probs.std().fillna(0)
    # `scatter` (default marker: circle) matches the other plots; `circle(size=...)` is deprecated in recent Bokeh.
    p3.scatter(mean_probs.index, mean_probs.values, size=10, color="blue", alpha=0.5)
    err = list(zip((mean_probs - mean_stds).values, (mean_probs + mean_stds).values))
    p3.multi_line(list(zip(mean_probs.index, mean_probs.index)), err, color='blue')
    p3.xaxis.ticker = cluster_ticks

    dashboard = row([p1, column(p2, p3)])
    show(dashboard)

    return representative_poses, sport_poses

def plot_representative_poses(representative_poses, ncols = 6, sport = None):
    """Draw every representative pose on a grid of subplots, titled with its cluster label.

    Args:
        representative_poses: DataFrame with one pose per row (as returned by
            `get_representative_poses`) or an iterable of pose rows. Every pose must
            expose a `label` entry.
        ncols: number of subplot columns.
        sport: name used in the figure title. When None, the title uses the `sport`
            entries found on the poses themselves, if any.

    Returns:
        None. Prints a message and returns without plotting when there is no pose to draw.
    """
    # Iterating a DataFrame directly yields its column names, so rows are extracted explicitly.
    if isinstance(representative_poses, pd.DataFrame):
        poses = [pose for _, pose in representative_poses.iterrows()]
    else:
        poses = list(representative_poses)
    if not poses:
        print("No representative poses to plot")
        return

    if sport is None:
        sports = sorted({str(pose["sport"]) for pose in poses if "sport" in pose})
        sport = ", ".join(sports) if sports else None

    nrows = int(np.ceil(len(poses) / ncols))
    # squeeze=False always returns a 2D array of axes, even for a 1x1 grid.
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 3, nrows * 3), squeeze=False)
    axs = axs.flatten()
    for ax, pose in zip(axs, poses):
        draw_pose(pose, ax = ax, cut = True, show_frame=True)
        ax.set_title(pose["label"])
    # Hide the cells of the grid that received no pose.
    for ax in axs[len(poses):]:
        ax.set_axis_off()
    fig.suptitle("Representative poses" if sport is None else f"Representative poses for {sport}")
    plt.show()

In [None]:
# Weightlifting: cosine UMAP of the imputed angle vectors, clustered with HDBSCAN.
weightlifting_clusterer = HDBSCAN(min_cluster_size=50, min_samples=10)
representative_poses, sport_poses = get_representative_poses(
    pose_df = pose_df,
    sport = "Weightlifting",
    feature = "angle_vec_fix",
    clusterer = weightlifting_clusterer,
    metric = "cosine",
)

In [None]:
def inspect_clusters(sport_poses):
    """Show up to six high-probability sample poses per cluster, one cluster per row.

    Args:
        sport_poses: DataFrame with `label` and `probability` columns, as returned by
            `get_representative_poses`. Rows labelled -1 (not assigned) are skipped.

    Returns:
        None. Prints a message and returns without plotting when every row is labelled -1.
    """
    ncols = 6
    # Count the real clusters instead of assuming that a -1 label is always present.
    clusters = sorted(label for label in sport_poses.label.unique() if label != -1)
    if not clusters:
        print("No clusters to inspect")
        return
    nrows = len(clusters)

    # squeeze=False keeps a 2D (nrows, ncols) array of axes even with a single cluster.
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 3, nrows * 3), squeeze=False)
    # Rows are matched to clusters by position, so labels need not be exactly 0..n-1.
    for row_axes, k in zip(axs, clusters):
        confident = sport_poses[(sport_poses["label"] == k) & (sport_poses.probability > 0.8)]
        sample_poses = confident.sample(min(ncols, len(confident)))
        for ax, (_, pose) in zip(row_axes, sample_poses.iterrows()):
            draw_pose(pose, ax = ax, cut = True, show_frame=True)
            ax.set_title(pose["label"])
        # Hide the cells of this row that received no pose.
        for ax in row_axes[len(sample_poses):]:
            ax.set_axis_off()
    plt.show()

In [None]:
# Fencing: cosine UMAP of the imputed angle vectors, clustered with HDBSCAN.
fencing_clusterer = HDBSCAN(min_cluster_size=20, min_samples=50)
representative_poses, sport_poses = get_representative_poses(
    pose_df = pose_df,
    sport = "Fencing",
    feature = "angle_vec_fix",
    clusterer = fencing_clusterer,
    metric = "cosine",
)

In [None]:
# Sample poses of every cluster found by the most recent `get_representative_poses` call.
inspect_clusters(sport_poses=sport_poses)

In [None]:
# One panel per cluster: mean ± std of every angle over the cluster's poses. Angles whose
# std reaches `spread_threshold` are drawn in red, the others in black.
# NOTE(review): the statistics use the raw `angle_vec` (zeros still present) although the
# clusters were computed from `angle_vec_fix` — confirm that is intended.
clusters = sorted(sport_poses.label.unique())
# Count the real clusters instead of assuming that a -1 ("not assigned") label is present.
cluster_ids = [k for k in clusters if k != -1]
ncols = 2
nrows = max(1, int(np.ceil(len(cluster_ids) / ncols)))

spread_threshold = 0.1

# squeeze=False always returns a 2D array of axes, so flatten() works for any grid size.
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 6, nrows * 2), squeeze=False)
axs = axs.flatten()
# Panels are matched to clusters by position, so labels need not be exactly 0..n-1.
for ax, k in zip(axs, cluster_ids):
    # A dedicated name keeps the notebook-level `angles` frame intact, so the earlier
    # cells that read it still show the same data when re-run.
    cluster_angles = pd.DataFrame(sport_poses[sport_poses.label == k].angle_vec.tolist(), columns = ANGLES_ASSOCIATIONS.keys())
    for j, angle in enumerate(cluster_angles.columns):
        mean_angle = cluster_angles[angle].mean()
        std_angle = cluster_angles[angle].std()
        ax.errorbar(j, mean_angle, std_angle, fmt='o', color = "black" if std_angle < spread_threshold else "red")
    ax.set_xticks(range(len(cluster_angles.columns)), cluster_angles.columns, rotation=0, fontsize = 6)
    ax.set_title(f"Cluster {k}", fontsize = 10)
# Hide the panels that received no cluster.
for ax in axs[len(cluster_ids):]:
    ax.set_axis_off()
plt.tight_layout()
plt.show()