In [None]:
%cd ../..
%load_ext autoreload

%autoreload 2

In [None]:
import os
import random
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import cv2
from numba import jit
from ast import literal_eval

from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import pairwise_distances
from hdbscan import HDBSCAN
from umap import UMAP
from umap.umap_ import nearest_neighbors

from emv.features.pose import load_poses 
from emv.features.pose_utils import draw_pose, CONNECTIONS, KEYPOINTS_NAMES, ANGLES_ASSOCIATIONS
from emv.features.pose_utils import compute_hips_angles, normalize_angles

from bokeh.io import output_notebook, show
from bokeh.layouts import layout, column, row
from bokeh.models.widgets import Div
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource
from bokeh import palettes

output_notebook()

In [None]:
# Load the sample pose table; the path is relative to the current working directory.
# NOTE(review): an empty `filter_poses` dict presumably means "apply no filter" — confirm against load_poses.
local_poses_path = "data/sample_pose_df.csv"
pose_df = load_poses(local_poses_path, filter_poses={})

In [None]:
# Preview the first two rows to check which columns were loaded.
pose_df.head(2)

In [None]:
# Keep only poses whose keypoints at indices 7 and 8 both have a third component above 0.5.
# NOTE(review): the third component is presumably a detection confidence — confirm, and confirm
# which joints indices 7 and 8 refer to in KEYPOINTS_NAMES.
# `.copy()` makes the result an independent frame, so the column assignments in the following
# cells do not write into a slice of the unfiltered table.
pose_df = pose_df[pose_df.keypoints.map(lambda x: x[7][2] > 0.5 and x[8][2] > 0.5)].copy()

# Computing features

In [None]:
# Hip angles per pose: the first element returned by compute_hips_angles, passed through normalize_angles.
pose_df["hips_angles"] = pose_df["keypoints"].map(
    lambda keypoints: normalize_angles(compute_hips_angles(keypoints)[0])
)

In [None]:
# One row per pose, one column per entry of ANGLES_ASSOCIATIONS.
angles = pd.DataFrame(pose_df.angle_vec.tolist(), columns = ANGLES_ASSOCIATIONS.keys())

# An angle of exactly 0 is treated as missing; its fallback is the mean of the non-zero values
# observed for that angle across all poses.
default_angles = [angles.loc[angles[angle] != 0, angle].mean() for angle in ANGLES_ASSOCIATIONS.keys()]

# Missing angles are replaced by the fallback plus a tiny random offset
# (presumably so that imputed vectors are not exact duplicates — confirm).
random_size = 0.0001
# Dedicated, seeded generator: the imputed features are identical on every run and the
# global `random` state is left untouched.
jitter_rng = random.Random(0)
pose_df["angle_vec"] = pose_df.angle_vec.map(
    lambda vec: [a if a != 0 else default_angles[i] + jitter_rng.random() * random_size for i, a in enumerate(vec)]
)

In [None]:
def get_directions(keypoints):
    """Return the 2-D offset between the two endpoints of every connection.

    Args:
        keypoints: Sequence of three-element entries ``(x, y, c)``, looked up by the
            position of each keypoint name in ``KEYPOINTS_NAMES``.

    Returns:
        ``np.ndarray`` holding one ``(x2 - x1, y2 - y1)`` pair per entry of
        ``CONNECTIONS``, in the order of ``CONNECTIONS``. The third component of
        each keypoint is not used.
    """
    endpoints = [
        (keypoints[KEYPOINTS_NAMES.index(start)], keypoints[KEYPOINTS_NAMES.index(end)])
        for start, end in CONNECTIONS
    ]
    return np.array([
        (x_end - x_start, y_end - y_start)
        for (x_start, y_start, _c_start), (x_end, y_end, _c_end) in endpoints
    ])

In [None]:
# Per-pose connection offsets, their flattened unit-norm version, and the concatenation of
# that unit vector with the angle vector.
pose_df["directions"] = pose_df["keypoints"].apply(get_directions)
pose_df["norm_directions"] = pose_df["directions"].apply(
    lambda vectors: vectors.ravel() / np.linalg.norm(vectors.ravel())
)
pose_df["feature_angle_dir"] = pose_df.apply(
    lambda row: np.concatenate((row["norm_directions"], row["angle_vec"])),
    axis=1,
)

# Identifying clusters of interesting poses

## Testing algorithms

In [None]:
# Poses of a single sport, used for the UMAP parameter sweep below.
sport = "Weightlifting"
sport_poses = pose_df.loc[pose_df["sport"] == sport]
print(f"Testing with {len(sport_poses)} poses from {sport}.")

### Testing UMAP params

In [None]:
def compute_umap_embeddings(features, n_neighbors, min_dist = 0.01, metric = "cosine"):
    """Fit one UMAP embedding per requested neighbourhood size.

    Args:
        features: Feature matrix passed to ``nearest_neighbors`` and to every UMAP fit.
        n_neighbors: Iterable of neighbourhood sizes; one embedding is produced per value.
        min_dist: ``min_dist`` forwarded to every UMAP instance.
        metric: Metric used for both the neighbour search and UMAP.

    Returns:
        List of dicts, one per value of ``n_neighbors`` and in the same order, with keys
        ``"n_neighbors"``, ``"embedding"`` (output of ``fit_transform``) and
        ``"pairwise_d"`` (Euclidean pairwise distances between embedded points).
    """
    # The neighbour search runs once, for the largest requested size, and its result is
    # handed to every UMAP instance through `precomputed_knn`.
    shared_knn = nearest_neighbors(
        features,
        n_neighbors=np.max(n_neighbors),
        metric=metric,
        metric_kwds={},
        angular=False,
        random_state=None,
    )

    def _embed(size):
        model = umap.UMAP(n_neighbors=size, min_dist=min_dist, metric=metric, precomputed_knn=shared_knn)
        coords = model.fit_transform(features)
        distances = pairwise_distances(coords, metric="euclidean")
        return {"n_neighbors": size, "embedding": coords, "pairwise_d": distances}

    return [_embed(size) for size in n_neighbors]
        
def plot_umap_embeddings(embeddings, d = 4, min_dist = 0.01):
    """Draw the embeddings side by side, one scatter panel per entry.

    Args:
        embeddings: Sequence of dicts with keys ``"embedding"`` (array whose first two
            columns are plotted) and ``"n_neighbors"`` (shown in the panel title), as
            returned by ``compute_umap_embeddings``.
        d: Side length of each panel, in figure-size units.
        min_dist: Value displayed as the y-label of the first panel (label only; it is
            not used for any computation here).
    """
    # The figure is sized from the argument itself, not from a notebook-level
    # `n_neighbors` global that may be out of sync with `embeddings`.
    n_panels = len(embeddings)
    if n_panels == 0:
        return

    # squeeze=False always yields a 2-D axes array, so a single panel works too.
    fig, axs = plt.subplots(nrows=1, ncols=n_panels, figsize=(n_panels * d, d), squeeze=False)
    for i, (ax, result) in enumerate(zip(axs[0], embeddings)):
        coords = result["embedding"]
        ax.scatter(coords[:,0], coords[:,1], s=0.1)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title(f"n_neighbors = {result['n_neighbors']}")
        if i == 0:
            ax.set_ylabel(f"min_dist = {min_dist}")
    plt.tight_layout()
    plt.show()

In [None]:
# Neighbourhood sizes to compare.
n_neighbors = [50, 100, 500, 1000]

# Same sweep on two feature sets: the full angle vector first, then the hips angles.
human_angles_embeddings, hips_angles_embeddings = (
    compute_umap_embeddings(
        features=np.array(sport_poses[column].tolist()),
        n_neighbors=n_neighbors,
    )
    for column in ("angle_vec", "hips_angles")
)

In [None]:
# One row of panels per feature set: full angle vector first, hips angles second.
plot_umap_embeddings(human_angles_embeddings)
plot_umap_embeddings(hips_angles_embeddings)

### Testing HDBSCAN params

In [None]:
# Embed the poses of a single sport; the HDBSCAN sweep in the next cell runs on `embeddings`.
sport = "Rugby"
# `.copy()` gives an independent frame, so the columns added below are not written into a
# slice of `pose_df`.
sport_poses = pose_df[pose_df.sport == sport].copy()
embeddings = UMAP(n_neighbors=500, min_dist=0.01, metric='cosine').fit_transform(sport_poses["angle_vec"].tolist())
sport_poses["umap_x"] = embeddings[:,0]
sport_poses["umap_y"] = embeddings[:,1]

In [None]:
# Test HDBSCAN parameters: one panel per (min_cluster_size, min_samples) pair, clustering
# the `embeddings` computed in the previous cell.
min_cluster_sizes = [5, 10, 15, 20]
min_samples = [5, 10, 15, 20]

# Coordinates are read once; they do not change between parameter settings.
umap_x = sport_poses["umap_x"].to_numpy()
umap_y = sport_poses["umap_y"].to_numpy()

hdbscan_clusterings = []
fig, axs = plt.subplots(nrows=len(min_cluster_sizes), ncols=len(min_samples), figsize=(len(min_samples) * 3, len(min_cluster_sizes) * 3), squeeze=False)
for i, min_cluster_size in enumerate(min_cluster_sizes):
    for j, min_sample in enumerate(min_samples):
        clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_sample, allow_single_cluster=True)
        clusterer.fit(embeddings)
        labels = clusterer.labels_
        probabilities = clusterer.probabilities_

        # After the sweep these columns hold the result of the LAST parameter pair.
        sport_poses.loc[:,"label"] = labels
        sport_poses.loc[:,"probability"] = probabilities

        hdbscan_clusterings.append({"min_cluster_size": min_cluster_size, "min_sample": min_sample, "labels": labels, "probabilities": probabilities})

        ax = axs[i,j]
        # Sorted labels: the unassigned group (-1), when present, always comes first, so
        # colours are assigned in the same order in every panel.
        for label in np.unique(labels):
            members = labels == label
            ax.scatter(umap_x[members], umap_y[members],
                       s=1, label=label, alpha = np.clip(probabilities[members], 0.1, 1))
        # Label -1 marks unassigned points and is not counted as a cluster.
        n_found = len(np.unique(labels[labels != -1]))
        ax.set_xlabel(f"Found {n_found} clusters", fontsize=10)
        ax.set_xticks([])
        ax.set_yticks([])
        if j == 0:
            ax.set_ylabel(f"min_cluster_size = {min_cluster_size}")
        if i == 0:
            ax.set_title(f"min_samples = {min_sample}")
plt.tight_layout()
plt.show()

In [None]:
# Summarise the sweep: one row per (min_cluster_size, min_sample) pair.
hdbscan_clusterings = pd.DataFrame(hdbscan_clusterings)
# Label -1 marks unassigned points, so it is excluded from the cluster count.
hdbscan_clusterings["n_clusters"] = hdbscan_clusterings["labels"].map(
    lambda x: len(np.unique(np.asarray(x)[np.asarray(x) != -1]))
)
hdbscan_clusterings["mean_prob"] = hdbscan_clusterings["probabilities"].map(np.mean)
hdbscan_clusterings["std_prob"] = hdbscan_clusterings["probabilities"].map(np.std)

# (column to show, panel title, extra heatmap options) for each of the three panels.
panels = [
    ("n_clusters", "Number of clusters", {"fmt": "d"}),
    ("mean_prob", "Mean probability", {}),
    ("std_prob", "Std probability", {}),
]

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
for ax, (column, title, heatmap_options) in zip(axs, panels):
    grid = hdbscan_clusterings.pivot(columns = "min_cluster_size", index = "min_sample", values = column)
    sns.heatmap(grid, annot=True, ax=ax, **heatmap_options)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## By sport

In [None]:
def get_representative_poses(pose_df, sport, feature, reducer = UMAP(n_components=2, n_neighbors=1000, min_dist=0.1, metric="cosine"), clusterer = DBSCAN(eps=0.2, min_samples=10)):
    """Embed and cluster the poses of one sport, plot the result, and pick one pose per cluster.

    Args:
        pose_df: Pose table with at least a ``sport`` column and the ``feature`` column.
        sport: Value of ``pose_df.sport`` to select.
        feature: Name of the column holding the per-pose feature vectors.
        reducer: Object with ``fit_transform``; the first two output columns are used
            as plot coordinates.
        clusterer: Object with ``fit`` that sets ``labels_`` (label -1 = unassigned).
            ``probabilities_`` is used when the clusterer provides it.

    Returns:
        Tuple ``(representative_poses, sport_poses)``:
        ``representative_poses`` has one row per cluster (the member closest to the
        cluster's mean embedding) and is empty when no cluster was found;
        ``sport_poses`` is a copy of the selected rows with ``umap_x``, ``umap_y``,
        ``label`` and ``probability`` columns added.

    Side effects:
        Shows a bokeh layout (embedding scatter, cluster sizes, mean probability per cluster).
    """
    # Independent copy: the columns added below are not written into a slice of `pose_df`.
    sport_poses = pose_df[pose_df.sport == sport].copy()

    embeddings = reducer.fit_transform(sport_poses[feature].tolist())
    sport_poses["umap_x"] = embeddings[:,0]
    sport_poses["umap_y"] = embeddings[:,1]

    clusterer.fit(embeddings)
    cluster_labels = np.asarray(clusterer.labels_)
    sport_poses["label"] = cluster_labels

    # Not every clusterer exposes membership strengths (the default DBSCAN does not).
    # Fall back to 1 for clustered points and 0 for unassigned ones instead of crashing.
    probabilities = getattr(clusterer, "probabilities_", None)
    if probabilities is None:
        probabilities = np.where(cluster_labels == -1, 0.0, 1.0)
    sport_poses["probability"] = probabilities

    p1 = figure(title="UMAP", width=500, height=500)

    palette = palettes.Turbo256
    labels = sport_poses["label"].unique()
    label_colors = {label:palette[int(256 * (label + 1) / (len(labels) + 1))] for label in labels}

    coord_columns = ["umap_x", "umap_y"]
    representative_rows = []
    for label in labels:
        members = sport_poses[sport_poses.label == label]
        p1.scatter(members["umap_x"], members["umap_y"],
                   alpha = members["probability"].clip(0.1, 1),
                   color = label_colors[label], size = 1)
        if label != -1:
            # Search among the cluster's own members only, so the representative always
            # carries the label of the cluster it stands for.
            centroid = members[coord_columns].mean().values
            distances = np.linalg.norm(members[coord_columns].values - centroid, axis=1)
            representative_rows.append(members.iloc[np.argmin(distances)])

    if representative_rows:
        representative_poses = pd.concat([p.to_frame().T for p in representative_rows])
        p1.scatter("umap_x", "umap_y", color="red", alpha = 1, size=12, marker = "x", source = ColumnDataSource(representative_poses))
    else:
        # No cluster found: return an empty frame with the same columns.
        representative_poses = sport_poses.iloc[0:0]

    # Hover info for the representative poses cluster label
    hover = HoverTool(tooltips=[("Cluster", "@label")])
    p1.add_tools(hover)

    clustered = sport_poses[sport_poses.label != -1]
    cluster_sizes = clustered.label.value_counts()
    cluster_ticks = cluster_sizes.index.tolist()

    p2 = figure(title=f"Items not assigned: {len(sport_poses[sport_poses.label == -1])}", width=700, height=250)
    p2.vbar(x=cluster_sizes.index, top=cluster_sizes.values, width=0.9, color = "blue")
    if cluster_ticks:
        p2.xaxis.ticker = cluster_ticks

    p3 = figure(title="Mean probability per cluster", width=700, height=250)
    per_cluster_probability = clustered.groupby("label").probability
    mean_probs = per_cluster_probability.mean()
    mean_stds = per_cluster_probability.std()
    # `scatter` draws circle markers by default and matches the calls used for p1.
    p3.scatter(mean_probs.index, mean_probs.values, size=10, color="blue", alpha=0.5)
    err = list(zip((mean_probs - mean_stds).values, (mean_probs + mean_stds).values))
    p3.multi_line(list(zip(mean_probs.index, mean_probs.index)), err, color='blue')
    if cluster_ticks:
        p3.xaxis.ticker = cluster_ticks

    title = Div(text=f"<h1>Representative poses for {sport}</h1>")
    # Named `page` so the imported bokeh `layout` function is not shadowed.
    page = column([title, row([p1, column(p2, p3)])])
    show(page)

    return representative_poses, sport_poses

def plot_representative_poses(representative_poses, ncols = 6, sport = None):
    """Draw every representative pose in a grid, titled with its cluster label.

    Args:
        representative_poses: Either a DataFrame with one pose per row (as returned by
            ``get_representative_poses``) or an iterable of pose records. Each pose must
            support ``pose["label"]`` and be accepted by ``draw_pose``.
        ncols: Number of columns of the grid.
        sport: Name shown in the figure title. When omitted it is taken from the first
            pose's ``"sport"`` field, falling back to a notebook-level ``sport`` variable.
    """
    # Iterating a DataFrame directly yields column names, so rows are extracted explicitly.
    if isinstance(representative_poses, pd.DataFrame):
        poses = [pose for _, pose in representative_poses.iterrows()]
    else:
        poses = list(representative_poses)
    if not poses:
        print("No representative poses to plot.")
        return

    if sport is None:
        sport = poses[0]["sport"] if "sport" in poses[0] else globals().get("sport", "")

    nrows = int(np.ceil(len(poses) / ncols))

    # squeeze=False always yields a 2-D axes array, so a 1x1 grid can be flattened too.
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 3, nrows * 3), squeeze=False)
    axs = axs.flatten()
    for ax, pose in zip(axs, poses):
        draw_pose(pose, ax = ax, cut = True, show_frame=True)
        ax.set_title(pose["label"])
    # Hide the grid cells that received no pose.
    for ax in axs[len(poses):]:
        ax.set_axis_off()
    fig.suptitle(f"Representative poses for {sport}")
    plt.show()

In [None]:
def inspect_clusters(sport_poses, save = False, sport = None):
    """Show, for every cluster, the poses closest to the cluster's mean embedding.

    One figure row per cluster (label -1, i.e. unassigned poses, is skipped), with up to
    six poses per row ordered by increasing distance to the cluster centre.

    Args:
        sport_poses: Pose table of a single sport with ``label``, ``umap_x`` and
            ``umap_y`` columns; each row must be accepted by ``draw_pose``.
        save: When True, write the figure to ``data/ioc_clusters/clusters_<sport>.png``
            and close it; otherwise display it.
        sport: Name used in the output file name. When omitted it is taken from the
            first row's ``sport`` column, falling back to a notebook-level ``sport`` variable.
    """
    ncols = 6
    # Rows are allocated for real clusters only, whether or not label -1 is present.
    clusters = sorted(label for label in sport_poses.label.unique() if label != -1)
    if not clusters:
        print("No clusters to inspect.")
        return
    nrows = len(clusters)

    # Work on a copy so the helper column is not written into the caller's frame.
    sport_poses = sport_poses.copy()
    coords = sport_poses[["umap_x", "umap_y"]].astype(float)
    # Per-row cluster centre, computed once per cluster instead of once per row.
    centres = coords.groupby(sport_poses["label"]).transform("mean")
    sport_poses["distance_to_cluster_centre"] = np.linalg.norm((coords - centres).to_numpy(), axis=1)

    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 3, nrows * 3), squeeze=False)
    # Rows are addressed by position in `clusters`, so labels need not be 0..K-1.
    for row_idx, k in enumerate(clusters):
        sample_poses = sport_poses[sport_poses.label == k].sort_values("distance_to_cluster_centre").head(ncols).reset_index(drop=True)
        for j, pose in sample_poses.iterrows():
            draw_pose(pose, ax = axs[row_idx, j], cut = True, show_frame=True)
            axs[row_idx, j].set_title(pose["label"])
        # Hide the cells of clusters that have fewer than `ncols` members.
        for ax in axs[row_idx, len(sample_poses):]:
            ax.set_axis_off()

    if save:
        if sport is None:
            if "sport" in sport_poses.columns:
                sport = sport_poses["sport"].iloc[0]
            else:
                sport = globals().get("sport", "unknown")
        out_path = f"data/ioc_clusters/clusters_{sport}.png"
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        fig.savefig(out_path)
        plt.close(fig)
    else:
        plt.show()

In [None]:
# Cluster every sport separately, with a fresh reducer and clusterer per sport.
# Per-sport frames are collected in a list and concatenated once at the end, instead of
# growing a DataFrame inside the loop.
clustered_frames = []
representative_poses_per_sport = []
for sport in pose_df.sport.unique():
    print(f"Testing with {sport}")
    representative_poses, sport_poses = get_representative_poses(pose_df = pose_df,
                                                             sport = sport, 
                                                             feature = "angle_vec",
                                                             reducer = UMAP(n_components=2, n_neighbors=500, min_dist=0.01, metric="cosine"), 
                                                             clusterer = HDBSCAN(min_cluster_size=50, min_samples=50))
    clustered_frames.append(sport_poses)
    representative_poses_per_sport.append({"sport": sport, "representative_poses": representative_poses})

pose_df_clustered = pd.concat(clustered_frames) if clustered_frames else pd.DataFrame()

In [None]:
# Mean and standard deviation of every angle, one panel per cluster of `sport_poses`.
# NOTE: `sport_poses` is whatever the last executed cell left behind (the last sport of
# the loop above when the notebook is run top to bottom).
# Label -1 (unassigned poses) is excluded up front, so the grid is sized from the real clusters.
clusters = sorted(label for label in sport_poses.label.unique() if label != -1)
ncols = 2
nrows = max(1, int(np.ceil(len(clusters) / ncols)))

# Angles whose standard deviation reaches this value are drawn in red.
spread_threshold = 0.1

fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 6, nrows * 2))
axs = axs.flatten()
# Panels are addressed by position, so labels need not be 0..K-1.
for ax, k in zip(axs, clusters):
    cluster_angles = pd.DataFrame(sport_poses[sport_poses.label == k].angle_vec.tolist(), columns = ANGLES_ASSOCIATIONS.keys())
    for j,angle in enumerate(cluster_angles.columns):
        mean_angle = cluster_angles[angle].mean()
        std_angle = cluster_angles[angle].std()
        ax.errorbar(j, mean_angle, std_angle, fmt='o', color = "black" if std_angle < spread_threshold else "red")
    ax.set_xticks(range(len(cluster_angles.columns)), cluster_angles.columns, rotation=0, fontsize = 6)
    ax.set_title(f"Cluster {k}", fontsize = 10)
# Hide the panels that received no cluster.
for ax in axs[len(clusters):]:
    ax.set_axis_off()
plt.tight_layout()
plt.show()

In [None]:
# Save one cluster-overview figure per sport (save = True writes the figure to disk instead of showing it).
# Keep the loop variable named `sport`: inspect_clusters may rely on a notebook-level
# variable of that name when building the output file name.
for sport in pose_df_clustered.sport.unique():
    print(f"Inspecting {sport}")
    inspect_clusters(pose_df_clustered[pose_df_clustered.sport == sport], save = True)

# Poses to keep

In [None]:
# Reload a clustered pose table from disk; this REPLACES the `pose_df_clustered` computed above.
# NOTE(review): no visible cell writes this file — confirm where it is produced.
# List-valued columns are stored as text and parsed back with literal_eval
# (the "angle_vec" converter was listed twice; one entry is enough).
pose_df_clustered = pd.read_csv(
    "data/sample_pose_df_clustered.csv",
    converters={"keypoints": literal_eval, "angle_vec": literal_eval, "bbox": literal_eval},
)

In [None]:
# Hand-picked cluster labels to keep, per sport (key: sport name, value: list of cluster labels).
# NOTE(review): the labels are presumably only valid for the clustering stored in
# data/sample_pose_df_clustered.csv — re-running the clustering may renumber clusters; confirm.
clusters_to_keep = {
    'Archery':[0, 1, 2, 3, 10, 11, 14], 
    'Athletics':[1, 5, 9, 10, 11, 16, 17, 18, 20, 23], 
    'Badminton':[32, 33, 45, 47], 
    'Baseball':[0, 1, 2, 3, 4, 5, 6, 11, 23], 
    'Basketball':[5, 6, 8, 12, 13, 14, 15, 16, 19, 20, 21, 22, 23, 26, 27, 28, 29],
    'Boxing':[2, 3, 4, 6, 16, 17], 
    'Canoeing':[], # No interesting clusters
    'Cycling':[0, 6, 7, 18, 19, 20, 21, 22, 23, 24, 25], 
    'Diving':[0, 1, 3, 4, 19, 25, 26], 
    'Fencing':[1, 2, 7, 8, 10], 
    'Football':[0, 1, 2, 3, 4, 5, 6, 7, 8], # All clusters but poses are not actually interesting
    'Golf':[0, 3, 4, 8, 9, 11, 12, 15, 16, 17, 18], 
    'Judo':[3, 12, 15, 20], 
    'Rowing':[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, 17, 18], 
    'Sailing':[2, 10, 32, 33, 36], 
    'Shooting':[11, 13, 32], 
    'Sport Climbing':[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19],
    'Surfing':[6], 
    'Swimming':[0], 
    'Table Tennis':[8, 10, 11, 12, 13, 16, 17, 19, 20, 21, 22, 25, 26, 32, 33], 
    'Taekwondo':[], # No interesting clusters / Not working
    'Tennis':[1, 2, 6, 21, 25],
    'Weightlifting':[0, 1, 2, 15, 19, 20], 
    'Wrestling':[1, 6], 
    'Skateboarding':[8, 9, 14, 24, 27, 28, 29], 
    'Gymnastics':[0, 1, 2, 3, 4, 27],
    # NOTE(review): 38 appears twice in the list below; harmless for a membership test,
    # but check whether a different label was intended.
    'Equestrian':[15, 19, 20, 21, 29, 30, 31, 32, 33, 34, 38, 38, 39, 40] # Athletes on horse but poses are not actually interesting
}

In [None]:
# Select, for every sport, the poses whose cluster label is in the hand-picked list, and
# record which fraction of that sport's poses survives.
# Per-sport selections are collected in a list and concatenated once at the end.
kept_frames = []
prop_poses_kept_per_sport = {}
for sport, clusters in clusters_to_keep.items():
    sport_poses = pose_df_clustered[pose_df_clustered.sport == sport]
    sport_poses_to_keep = sport_poses[sport_poses.label.isin(clusters)]
    kept_frames.append(sport_poses_to_keep)
    # A sport listed in clusters_to_keep may have no rows in the table: report 0 instead
    # of dividing by zero.
    prop_poses_kept_per_sport[sport] = len(sport_poses_to_keep) / len(sport_poses) if len(sport_poses) else 0.0

poses_to_keep = pd.concat(kept_frames) if kept_frames else pd.DataFrame()

print(f"Keeping {len(poses_to_keep)} poses in total from {len(pose_df_clustered)}.")

In [None]:
# Bar chart of the fraction of poses kept, one bar per sport in dictionary order.
fig, ax = plt.subplots(figsize=(15, 5))
sport_names = list(prop_poses_kept_per_sport.keys())
bar_positions = range(len(sport_names))
ax.bar(bar_positions, list(prop_poses_kept_per_sport.values()))
ax.set_xticks(bar_positions)
ax.set_xticklabels(sport_names, rotation=45)
ax.set_title("Proportion of poses kept per sport")
plt.show()

In [None]:
# Mean and standard deviation of the clustering probability of the kept poses, per sport.
probability_by_sport = poses_to_keep.groupby("sport")["probability"]
clustering_probs_means = probability_by_sport.mean()
clustering_probs_stds = probability_by_sport.std()

fig, ax = plt.subplots(figsize=(15, 5))
ax.errorbar(clustering_probs_means.index, clustering_probs_means, yerr=clustering_probs_stds, fmt='o')
ax.set_title("Mean probability of HDBSCAN clustering per sport")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Persist the kept poses, restricted to the columns listed here.
export_columns = [
    'guid', 'media_id', 'frame_number', 'sport',
    'angle_score', 'keypoints', 'bbox', 'angle_vec',
]
poses_to_keep[export_columns].to_csv("data/sample_poses_to_keep.csv", index=False)

In [None]:
# Reload the exported poses; the three list-valued columns are parsed back from text with literal_eval.
poses_to_keep = pd.read_csv(
    "data/sample_poses_to_keep.csv",
    converters=dict.fromkeys(("keypoints", "bbox", "angle_vec"), literal_eval),
)

In [None]:
# Settings of the UMAP sweep over all kept poses.
n_neighbors = [50, 100, 500, 1000]
min_dist = 0.01
metric = "cosine"
feature = "angle_vec"

# The neighbour search runs once, for the largest neighbourhood size, so the UMAP fits
# in the next cell can reuse it.
features = np.array(poses_to_keep[feature].tolist())
knn = nearest_neighbors(
    features,
    n_neighbors=np.max(n_neighbors),
    metric=metric,
    metric_kwds={},
    angular=False,
    random_state=None,
)

In [None]:
# One UMAP fit per neighbourhood size, each reusing the precomputed neighbour search.
umap_embeddings = []
for n in n_neighbors:
    reducer = umap.UMAP(
        n_neighbors=n,
        min_dist=min_dist,
        metric=metric,
        precomputed_knn=knn,
    )
    embeddings = reducer.fit_transform(features)
    umap_embeddings.append(dict(n_neighbors=n, embedding=embeddings))

In [None]:
# One scatter panel per neighbourhood size, side by side.
n_panels = len(n_neighbors)
fig, axs = plt.subplots(nrows=1, ncols=n_panels, figsize=(n_panels * 4, 4))
for i, result in enumerate(umap_embeddings):
    embeddings = result["embedding"]
    panel = axs[i]
    panel.scatter(embeddings[:, 0], embeddings[:, 1], s=0.01)
    panel.set_xticks([])
    panel.set_yticks([])
    panel.set_title(f"n_neighbors = {result['n_neighbors']}")
    # Only the left-most panel carries the shared min_dist label.
    if i == 0:
        panel.set_ylabel(f"min_dist = {min_dist}")
plt.tight_layout()
plt.show()

In [None]:
# Take the embedding fitted with n_neighbors == 1000 and store its coordinates on the pose table.
matching_embeddings = [entry["embedding"] for entry in umap_embeddings if entry["n_neighbors"] == 1000]
embedding = matching_embeddings[0]
poses_to_keep["umap_x"] = embedding[:, 0]
poses_to_keep["umap_y"] = embedding[:, 1]

# One scatter call per sport, so each sport gets its own colour.
fig, ax = plt.subplots(figsize=(10, 10))
for sport in poses_to_keep.sport.unique():
    sport_p = poses_to_keep[poses_to_keep.sport == sport]
    ax.scatter(sport_p["umap_x"], sport_p["umap_y"], s=1, label=sport, alpha=0.5)
ax.axis('off')
plt.show()

In [None]:
# Fit a standalone UMAP model on the angle vectors of all kept poses; the fitted model
# stays available as `umap_model`.
umap_model = UMAP(
    n_neighbors=1000,
    min_dist=0.01,
    metric='cosine',
)
embeddings = umap_model.fit_transform(poses_to_keep["angle_vec"].tolist())

In [None]:
# One colour per sport, taken at evenly spaced positions of the Turbo256 palette.
sports = poses_to_keep["sport"].unique()
indices = np.linspace(0, len(palettes.Turbo256) - 1, len(sports), dtype=int)
palette = dict(zip(sports, (palettes.Turbo256[i] for i in indices)))
poses_to_keep["color"] = poses_to_keep["sport"].map(palette)

data = ColumnDataSource(data=dict(
    x=embeddings[:, 0],
    y=embeddings[:, 1],
    sport=poses_to_keep["sport"],
    color=poses_to_keep["color"],
))

# Interactive scatter: the legend groups points by sport and hovering shows the sport name.
p = figure(title="UMAP", width=800, height=800)
p.scatter("x", "y", source=data, color="color", legend_group="sport", size=1, alpha=0.5)
hover = HoverTool(tooltips=[("Sport", "@sport")])
p.add_tools(hover)

show(p)