# Create a projection from the IOC data

There are 3 components to a projection with the poses:
1. Features from the feature table (Feature Table)
2. A Projection (Projection table)
3. Mappings of the coordinates to the projection and features (MapProjectionFeature Table)

As the keypoints and angles are already provided in the associated feature data, the atlas is not needed to create the projection.

In [None]:
%cd ../..
%load_ext autoreload

%autoreload 2

In [None]:
from emv.db.dao import DataAccessObject
from emv.db.queries import get_features_by_type_paginated, count_features_by_type
from sqlalchemy.sql import text
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import numpy as np
from emv.api.models import Feature
from emv.api.models import Projection, MapProjectionFeatureCreate
from emv.db.queries import create_projection, create_map_projection_feature, create_feature, get_library_id_from_name, get_all_media_by_library_id, count_media_by_library_id, get_media_by_id
from umap import UMAP
import numba
from tqdm import tqdm
import random

#from emv.features.pose import load_poses 
from emv.client.get_content import get_features
from emv.utils import dataframe_from_hdf5
from emv.settings import DRIVE_PATH

In [None]:
@numba.njit(fastmath=True)
def cylinder_euclidean_grad(x, y, cylinder_dimension=2*np.pi, linear_dimension=1.0):
    """Distance and gradient between two points on a cylinder.

    The first coordinate wraps around with period ``cylinder_dimension``;
    the second coordinate is an ordinary linear axis. Only ``x[0]``/``y[0]``
    and ``x[1]``/``y[1]`` are read.

    Args:
        x, y: Points between which the distance and gradient are computed.
        cylinder_dimension: Period of the wrap-around coordinate
            (default 2*pi).
        linear_dimension: Accepted but not used anywhere in the body.

    Returns:
        A ``(distance, gradient)`` tuple. The gradient has the shape of ``x``
        and is divided by ``1e-6 + distance``.
    """
    distance_sqr = 0.0
    g = np.zeros_like(x)
    
    # Cylindrical dimension (e.g., angular wraparound)
    a = abs(x[0] - y[0])
    if 2 * a < cylinder_dimension:
        # The direct path is the shorter one.
        distance_sqr += a ** 2
        g[0] = (x[0] - y[0])
    else:
        # Going around the cylinder is shorter: use the wrapped distance.
        # `a` is at least cylinder_dimension / 2 here, so the division is
        # safe for a positive cylinder_dimension.
        distance_sqr += (cylinder_dimension - a) ** 2
        g[0] = (x[0] - y[0]) * (a - cylinder_dimension) / a
    
    # Linear dimension (e.g., height)
    b = abs(x[1] - y[1])
    distance_sqr += b ** 2
    g[1] = (x[1] - y[1])
    
    distance = np.sqrt(distance_sqr)
    # The 1e-6 term avoids a division by zero for coincident points.
    return distance, g / (1e-6 + distance)

# Add filtered poses as new features to the DB

In [None]:
# Load the sample of poses to keep from a local CSV file.
# NOTE(review): `load_poses` is not imported in this notebook (its import from
# `emv.features.pose` is commented out in the imports cell), so this cell
# raises NameError as written — TODO confirm where `load_poses` now lives.
local_poses_path = "data/sample_poses_to_keep.csv"
pose_df = load_poses(access_point="LOCAL", local_fp=local_poses_path, filter_poses={})
# `angle_vec_fix` is parsed with literal_eval, so it is presumably stored as a
# string literal in the CSV — verify against the file.
pose_df["angle_vec"] = pose_df.angle_vec_fix.map(lambda x: literal_eval(x))

In [None]:
# Keep only the poses whose keypoints 7 and 8 both have a third component
# above 0.5 (presumably the detection confidence of the two hips — confirm
# against the keypoint layout produced by the pose loader).
pose_df = pose_df[pose_df.keypoints.map(lambda kps: all(kps[idx][2] > 0.5 for idx in (7, 8)))]
print(pose_df.shape)

In [None]:
def centralize_and_scale_keypoints(keypoints, reference_points=['left_hip', 'right_hip'], target_diameter=1):
    """Center a pose on the midpoint of two reference keypoints and rescale it.

    The pose is translated so that the midpoint of the two reference
    keypoints sits at the origin, then scaled uniformly so that the keypoint
    farthest from the origin lies at distance ``target_diameter / 2``.

    Args:
        keypoints: Sequence of 13 points ordered as nose, shoulders, elbows,
            wrists, hips, knees, ankles (left before right).
        reference_points: Names of the two keypoints whose midpoint becomes
            the origin. The default list is only read, never mutated.
        target_diameter: Diameter of the circle, centered on the origin, that
            encloses the scaled pose.

    Returns:
        np.ndarray with the same shape as ``keypoints``. If every keypoint
        coincides with the center, the all-zero translated pose is returned
        instead of dividing by zero.
    """
    keypoint_names = [
        'nose', 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow',
        'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', 'right_knee',
        'left_ankle', 'right_ankle'
    ]
    keypoints = np.asarray(keypoints, dtype=float)

    # Reference point: midpoint between the two reference keypoints.
    first_ref = keypoints[keypoint_names.index(reference_points[0])]
    second_ref = keypoints[keypoint_names.index(reference_points[1])]
    center_point = (first_ref + second_ref) / 2

    # Translate the pose so that the reference point is the origin.
    translated_keypoints = keypoints - center_point

    # Largest distance from the origin to any keypoint.
    max_distance = np.max(np.linalg.norm(translated_keypoints, axis=1))
    if max_distance == 0:
        # Degenerate pose (all points identical): nothing to scale.
        return translated_keypoints

    # Scale so that the farthest keypoint ends up at half the target diameter.
    scale_factor = (target_diameter / 2) / max_distance
    return translated_keypoints * scale_factor

def get_angle_feature_vector(keypoints, ref_indices=(7, 8)):
    """Build an angle-based feature vector for one pose.

    For every keypoint that is not a reference point, the triangle
    (first reference, keypoint, second reference) is formed and its three
    interior angles are appended, in the order: angle at the first reference,
    angle at the keypoint, angle at the second reference.

    Args:
        keypoints: Sequence of 2D points.
        ref_indices: Indices of the two reference keypoints (the two hips in
            the 13-keypoint layout used by this notebook).

    Returns:
        np.ndarray of angles divided by 180, three values per non-reference
        keypoint (33 values for 13 keypoints). A triangle with two coincident
        vertices contributes three NaNs.
    """
    def interior_angle(opposite, side_1, side_2):
        # Law of cosines. Rounding can push the cosine marginally outside
        # [-1, 1] for (near-)collinear points, which would make arccos
        # return NaN, so clamp it first.
        cosine = (side_1**2 + side_2**2 - opposite**2) / (2 * side_1 * side_2)
        return np.degrees(np.arccos(np.clip(cosine, -1.0, 1.0)))

    def calculate_angle(points):
        if len(points) != 3:
            raise ValueError("Three points are required to calculate the angles")

        first, middle, last = (np.asarray(p, dtype=float) for p in points)

        # A triangle with two identical vertices has no defined angles.
        if np.array_equal(first[:2], last[:2]) or \
                np.array_equal(middle[:2], last[:2]) or \
                np.array_equal(first[:2], middle[:2]):
            return np.nan, np.nan, np.nan

        # Side lengths of the triangle.
        a = np.linalg.norm(middle - last)
        b = np.linalg.norm(first - last)
        c = np.linalg.norm(first - middle)

        # Angles at `first`, `middle` and `last` respectively.
        return interior_angle(a, b, c), interior_angle(b, a, c), interior_angle(c, a, b)

    first_ref, second_ref = ref_indices
    angles = []

    # Angles of each keypoint relative to the two reference points.
    for i, keypoint in enumerate(keypoints):
        if i in ref_indices:
            continue
        angles.extend(calculate_angle(
            [keypoints[first_ref], keypoint, keypoints[second_ref]]))

    # Angles are in degrees in [0, 180]; rescale to [0, 1].
    return np.array(angles) / 180.0

In [None]:
# Turn every keypoint into a plain list, normalize the (x, y) part of each
# pose, then derive the angle embedding from the normalized keypoints.
pose_df["keypoints"] = pose_df.keypoints.map(lambda kps: [list(kp) for kp in kps])
pose_df["keypoints_norm"] = pose_df.keypoints.map(
    lambda kps: centralize_and_scale_keypoints([kp[:2] for kp in kps])
)
pose_df["embedding_33"] = pose_df.keypoints_norm.map(get_angle_feature_vector)

In [None]:
# Keep only the poses whose angle embedding contains no NaN value.
pose_df = pose_df[pose_df.embedding_33.map(lambda x: not np.isnan(x).any())] # Remove poses with NaN angles
print(pose_df.shape)

In [None]:
# Store every filtered pose as a 'pose_filtered' feature in the DB.
# `DataFrame.iloc` is an indexer and has no `iterrows`; iterate the frame itself.
for _, row in pose_df.iterrows():
    feature = Feature(
        feature_type='pose_filtered',
        version="1",
        model_name='PifPafModel.fast',
        model_params={'PifPafModel': 'fast'},
        data={
            "frame": row["frame_number"],
            "sport": row["sport"],
            "keypoints": row["keypoints"],
            "keypoints_norm": row["keypoints_norm"].tolist()
        },
        media_id=row['media_id'],
        embedding_size=33,
        # Plain list rather than ndarray, matching how the 'pose_all'
        # features are created later in this notebook.
        embedding_33=row['embedding_33'].tolist()
    )
    create_feature(feature)

# Load features from DB

## Load poses filtered

In [None]:
# Retrieve the stored 'pose_filtered' features: a smaller dataset than the raw
# 'pose' features, carrying normalized keypoints and angle embeddings.
query = text("SELECT * FROM feature WHERE feature_type = 'pose_filtered'")
df = pd.DataFrame(DataAccessObject().fetch_all(query))
# The embedding comes back as a string literal; parse it into a Python list.
df['embedding_33'] = df['embedding_33'].apply(literal_eval)
df["sport"] = df.data.map(lambda x: x["sport"])

print(f"{df.shape[0]} poses retrieved")

# Cap every sport at N_sample poses so that large sports do not dominate.
N_sample = 500
category = "sport"
sample_df = []
for label in df[category].unique():
    # Select the rows of this label once instead of once per use.
    label_df = df[df[category] == label]
    n_poses_in_sport = len(label_df)
    if n_poses_in_sport < N_sample:
        sample_df.append(label_df)
    else:
        sample_df.append(label_df.sample(N_sample, random_state=42))
sample_df = pd.concat(sample_df)
sample_df = sample_df.reset_index(drop=True)
print(f"Testing with {len(sample_df)} poses.")

In [None]:
# Preview the first rows of the retrieved features.
df.head()

## Load all poses

In [None]:
def centralize_and_scale_keypoints(keypoints, reference_points=['left_hip', 'right_hip'], target_diameter=1):
    """Center a pose on the midpoint of two reference keypoints and rescale it.

    The pose is translated so that the midpoint of the two reference
    keypoints sits at the origin, then scaled uniformly so that the keypoint
    farthest from the origin lies at distance ``target_diameter / 2``.

    Args:
        keypoints: Sequence of 13 points ordered as nose, shoulders, elbows,
            wrists, hips, knees, ankles (left before right).
        reference_points: Names of the two keypoints whose midpoint becomes
            the origin. The default list is only read, never mutated.
        target_diameter: Diameter of the circle, centered on the origin, that
            encloses the scaled pose.

    Returns:
        np.ndarray with the same shape as ``keypoints``. If every keypoint
        coincides with the center, the all-zero translated pose is returned
        instead of dividing by zero.
    """
    keypoint_names = [
        'nose', 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow',
        'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', 'right_knee',
        'left_ankle', 'right_ankle'
    ]
    keypoints = np.asarray(keypoints, dtype=float)

    # Reference point: midpoint between the two reference keypoints.
    first_ref = keypoints[keypoint_names.index(reference_points[0])]
    second_ref = keypoints[keypoint_names.index(reference_points[1])]
    center_point = (first_ref + second_ref) / 2

    # Translate the pose so that the reference point is the origin.
    translated_keypoints = keypoints - center_point

    # Largest distance from the origin to any keypoint.
    max_distance = np.max(np.linalg.norm(translated_keypoints, axis=1))
    if max_distance == 0:
        # Degenerate pose (all points identical): nothing to scale.
        return translated_keypoints

    # Scale so that the farthest keypoint ends up at half the target diameter.
    scale_factor = (target_diameter / 2) / max_distance
    return translated_keypoints * scale_factor

def get_angle_feature_vector(keypoints, ref_indices=(7, 8)):
    """Build an angle-based feature vector for one pose.

    For every keypoint that is not a reference point, the triangle
    (first reference, keypoint, second reference) is formed and its three
    interior angles are appended, in the order: angle at the first reference,
    angle at the keypoint, angle at the second reference.

    Args:
        keypoints: Sequence of 2D points.
        ref_indices: Indices of the two reference keypoints (the two hips in
            the 13-keypoint layout used by this notebook).

    Returns:
        np.ndarray of angles divided by 180, three values per non-reference
        keypoint (33 values for 13 keypoints). A triangle with two coincident
        vertices contributes three NaNs.
    """
    def interior_angle(opposite, side_1, side_2):
        # Law of cosines. Rounding can push the cosine marginally outside
        # [-1, 1] for (near-)collinear points, which would make arccos
        # return NaN, so clamp it first.
        cosine = (side_1**2 + side_2**2 - opposite**2) / (2 * side_1 * side_2)
        return np.degrees(np.arccos(np.clip(cosine, -1.0, 1.0)))

    def calculate_angle(points):
        if len(points) != 3:
            raise ValueError("Three points are required to calculate the angles")

        first, middle, last = (np.asarray(p, dtype=float) for p in points)

        # A triangle with two identical vertices has no defined angles.
        if np.array_equal(first[:2], last[:2]) or \
                np.array_equal(middle[:2], last[:2]) or \
                np.array_equal(first[:2], middle[:2]):
            return np.nan, np.nan, np.nan

        # Side lengths of the triangle.
        a = np.linalg.norm(middle - last)
        b = np.linalg.norm(first - last)
        c = np.linalg.norm(first - middle)

        # Angles at `first`, `middle` and `last` respectively.
        return interior_angle(a, b, c), interior_angle(b, a, c), interior_angle(c, a, b)

    first_ref, second_ref = ref_indices
    angles = []

    # Angles of each keypoint relative to the two reference points.
    for i, keypoint in enumerate(keypoints):
        if i in ref_indices:
            continue
        angles.extend(calculate_angle(
            [keypoints[first_ref], keypoint, keypoints[second_ref]]))

    # Angles are in degrees in [0, 180]; rescale to [0, 1].
    return np.array(angles) / 180.0

In [None]:
# Page through the 'pose' features until at least MAX_FEATURES are loaded or
# the server has nothing more to return.
MAX_FEATURES = 10000
poses = get_features_by_type_paginated("pose", page_size=1000)

# `poses and ...` avoids indexing into an empty first page.
while poses and len(poses) < MAX_FEATURES:
    last_seen_id = poses[-1].get("feature_id", None)
    if last_seen_id is None:
        break
    next_page = get_features_by_type_paginated("pose", page_size=1000, last_seen_feature_id=last_seen_id)
    if not next_page:
        # No more features: without this check the loop would never end when
        # fewer than MAX_FEATURES features exist.
        break
    poses.extend(next_page)

print(f"Retrieved {len(poses)} instances")

In [None]:
# Flatten the nested feature payload into one row per pose annotation:
# feature -> frames -> annotations.
df = pd.DataFrame(poses)
df["frames"] = df["data"].map(lambda x: x["frames"])
df = df.explode("frames")

df["frame"] = df["frames"].map(lambda x: x["frame"])
df["annotations"] = df["frames"].map(lambda x: x["data"]["annotations"])
df = df.explode("annotations")

# Exploding an empty annotation list yields a missing value; drop those rows.
df.dropna(subset = ["annotations"], inplace=True)

df["score"] = df["annotations"].map(lambda x: x["score"])

# Cumulative histogram of the scores over 20 bins
df.score.hist(bins=20, cumulative=True)
plt.title(f"Cumulative score distribution on {len(df)} poses")
plt.show()

In [None]:
# Keep the poses with a score above 0.6 and attach the sport of each media
# from the local metadata file.
df = df[df.score > 0.6]
metadata = pd.read_hdf("data/metadata.hdf5")
# Media ids are the metadata sequence ids prefixed with "ioc-".
metadata["media_id"] = metadata.seq_id.map(lambda x: f"ioc-{x}")
# Left join: poses without matching metadata are kept with a missing sport.
df = df.merge(metadata[["media_id", "sport"]], on="media_id", how="left")

In [None]:
# Cap every sport at N_sample poses so that large sports do not dominate.
N_sample = 10000
category = "sport"
sample_df = []
for label in df[category].unique():
    # Select the rows of this label once instead of once per use.
    label_df = df[df[category] == label]
    n_poses_in_sport = len(label_df)
    if n_poses_in_sport < N_sample:
        sample_df.append(label_df)
    else:
        sample_df.append(label_df.sample(N_sample, random_state=42))
sample_df = pd.concat(sample_df)
sample_df = sample_df.reset_index(drop=True)
df = sample_df
print(f"Testing with {len(df)} poses.")

In [None]:
# Extract bounding box and keypoints from each annotation, then normalize the
# keypoints and compute the angle embedding.
df["bbox"] = df["annotations"].map(lambda x: x["bbox"])
df["keypoints"] = df["annotations"].map(lambda x: x["keypoints"])
# The flat keypoint list is cut into triples (presumably x, y, confidence — verify).
df["keypoints"] = df["keypoints"].map(lambda x: [x[i:i+3] for i in range(0, len(x), 3)])
df["keypoints"] = df["keypoints"].map(lambda x: [k for i,k in enumerate(x) if i not in [1,2,3,4]]) # Remove eyes and ears
# Only the first two components of each keypoint are used for normalization.
df["keypoints_norm"] = df["keypoints"].map(lambda x: centralize_and_scale_keypoints([k[:2] for k in x]))
df["embedding_33"] = df.keypoints_norm.map(lambda x: get_angle_feature_vector(x))
# NOTE: the index is not reset after this filter, so index labels may have gaps.
df = df[df.embedding_33.map(lambda x: not np.isnan(x).any())] # Remove poses with NaN angles

df = df[["media_id", "frame", "keypoints", "keypoints_norm", "embedding_33", "bbox", "score", "sport"]]

print(f"Retrieved {df.shape[0]} poses")

In [None]:
# Preview the first two processed poses.
df.head(2)

In [None]:
# Create one 'pose_all' feature in the DB per row and keep what
# `create_feature` returns in the "feature_id" column.
df["feature_id"] = df.apply(lambda x: create_feature(Feature(
                                                        feature_type='pose_all',
                                                        version="1",
                                                        model_name='PifPafModel.fast',
                                                        model_params={'PifPafModel': 'fast'},
                                                        data={
                                                            "frame":x["frame"],
                                                            "sport":x["sport"],
                                                            "keypoints":x["keypoints"],
                                                            "keypoints_norm":x["keypoints_norm"].tolist()
                                                            },
                                                        media_id=x['media_id'], 
                                                        embedding_size=33,
                                                        embedding_33=x['embedding_33'].tolist()
                                                    )), axis=1)

In [None]:
# Replace each stored `create_feature` result by its "feature_id" entry.
# NOTE(review): running this cell twice fails, because the column then no
# longer holds objects with a `.get` method.
df["feature_id"] = df["feature_id"].map(lambda x: x.get("feature_id", None))

# Create projection

In [None]:
# Default values that assume an atlas. No atlas is needed for this projection,
# but it is good to give consistent values when creating it.

total_tiles = len(df) # either all features or a subset of features
atlas_width = 4096
max_tile_size = 512
max_tiles_per_atlas = (atlas_width // max_tile_size) ** 2
# Ceiling division: `int(total / per_atlas) + 1` counted one atlas too many
# whenever total_tiles was an exact multiple of max_tiles_per_atlas.
# At least one atlas is always reported, as before.
atlas_count = max(1, (total_tiles + max_tiles_per_atlas - 1) // max_tiles_per_atlas)

## Cylindrical Projection

In [None]:
# Fit a UMAP whose output space uses the cylindrical metric defined earlier
# in this notebook.
cylinder_mapper = UMAP(output_metric=cylinder_euclidean_grad, min_dist=0.1, n_neighbors=100, random_state=42)
# One row per pose, in the positional order of `df`.
data = np.array(df['embedding_33'].tolist())
embedding = cylinder_mapper.fit_transform(data)

In [None]:
# Give every sport one "Set2" palette color, in order of first appearance.
# The lookup table is built once instead of rebuilding the list of unique
# sports for every row.
sport_labels = list(df.sport.unique())
color_palette = sns.color_palette("Set2", n_colors=len(sport_labels))
sport_to_color = {sport: color_palette[position] for position, sport in enumerate(sport_labels)}
colors = df.sport.map(lambda x: sport_to_color[x])

In [None]:
# Panorama dimensions
R_pano = 1
H_pano = 1

# Cylindrical dimension (theta) and height (h)
cylinder_dimension = 2 * np.pi
radius = R_pano  # Radius of the cylinder

# Extract the cylindrical (theta) and linear (h) coordinates
# The modulo folds the angular coordinate into [0, cylinder_dimension).
theta_coords = cylinder_mapper.embedding_[:, 0] % cylinder_dimension
h_coords = cylinder_mapper.embedding_[:, 1]
# NOTE(review): divides by zero if all heights are identical.
h_coords = H_pano * (h_coords - np.min(h_coords)) / (np.max(h_coords) - np.min(h_coords)) # Remap height to [0, H_pano] size of the Panorama

# Convert cylindrical coordinates to Cartesian coordinates
x = radius * np.cos(theta_coords)
y = radius * np.sin(theta_coords)
z = h_coords

# One (x, y, z) row per pose, in the same order as the fitted data.
embedding_cartesian = np.stack([x, y, z], axis=1)

# Left: points on the 3D cylinder. Right: the same points unwrapped (theta vs. height).
fig = plt.figure(figsize=(20, 8))
ax1 = fig.add_subplot(121, projection='3d')
ax1.scatter(x, y, z, s = 0.1)
ax1.set_title("Cylindrical projection", fontweight = "bold")

ax2 = fig.add_subplot(122)
ax2.scatter(theta_coords, h_coords, s = 0.1)
ax2.set_title("Unwrapped cylinder", fontweight = "bold")

plt.tight_layout()
plt.show()

In [None]:
# Create the projection entry in the DB. Replace the name and version with the
# desired ones; library_id = 2 is the IOC library.
projection = Projection(
    projection_name="IOC Poses All Cylindrical UMAP 1",
    version="0.0.1",
    library_id=2,
    model_name="openpifpaf_fast",
    model_params={},
    data={},
    dimension=3,
    atlas_folder_path="",
    atlas_width=atlas_width,
    tile_size=max_tile_size,
    atlas_count=atlas_count,
    total_tiles=total_tiles,
    tiles_per_atlas=max_tiles_per_atlas,
)

# The id of the new projection is needed to attach the mapped features to it.
projection_id = create_projection(projection)['projection_id']
print(f"Projection ID: {projection_id}")

In [None]:
# Create an entry in the map_projection_feature table for each feature; it
# links features, media and coordinates.
if len(df) != len(embedding_cartesian):
    raise ValueError("df and embedding_cartesian are out of sync; recompute the projection.")

# The embedding rows follow the positional order of `df`. `df` was filtered
# without resetting its index, so the iterrows() label must not be used to
# index the embedding: pair the rows by position instead.
for position, (_, row) in enumerate(df.iterrows()):
    create_map_projection_feature(MapProjectionFeatureCreate(
        projection_id=projection_id,
        media_id=row.media_id,
        atlas_order=-1,
        index_in_atlas=-1,
        # .tolist() yields the three coordinates as plain Python floats.
        coordinates=embedding_cartesian[position].tolist(),
        feature_id=row.feature_id
    ))

## 3D Projection

In [None]:
# Fit a standard 3-component UMAP on the angle embeddings.
threed_mapper = UMAP(n_components=3, min_dist=0.1, n_neighbors=100, random_state=42)
# One row per pose, in the positional order of `df`.
data = np.array(df['embedding_33'].tolist())
embedding_threed = threed_mapper.fit_transform(data)

In [None]:
# 3D scatter plot of the UMAP embedding, one point per pose, colored with the
# `colors` series computed earlier.
fig = plt.figure(figsize=(12, 12))
# Subplot code 121 places the axes in the left half of the figure.
ax1 = fig.add_subplot(121, projection='3d')
ax1.scatter(embedding_threed[:, 0], embedding_threed[:, 1], embedding_threed[:, 2], c=colors, s = 0.1)
plt.show()

In [None]:
# Create the projection entry in the DB. Replace the name and version with the
# desired ones; library_id = 2 is the IOC library.
# NOTE(review): the name says "Filtered", but when the notebook is run top to
# bottom `df` holds the 'pose_all' features at this point — confirm the name.
projection = Projection(
    projection_name="IOC Poses Filtered 3D UMAP",
    version="0.0.1",
    library_id=2,
    model_name="openpifpaf_fast",
    model_params={},
    data={},
    dimension=3,
    atlas_folder_path="",
    atlas_width=atlas_width,
    tile_size=max_tile_size,
    atlas_count=atlas_count,
    total_tiles=total_tiles,
    tiles_per_atlas=max_tiles_per_atlas,
)

# The id of the new projection is needed to attach the mapped features to it.
projection_id = create_projection(projection)['projection_id']
print(f"Projection ID: {projection_id}")

In [None]:
# Create an entry in the map_projection_feature table for each feature; it
# links features, media and coordinates.
if len(df) != len(embedding_threed):
    raise ValueError("df and embedding_threed are out of sync; recompute the projection.")

# The embedding rows follow the positional order of `df`. `df` was filtered
# without resetting its index, so the iterrows() label must not be used to
# index the embedding: pair the rows by position instead.
for position, (_, row) in enumerate(df.iterrows()):
    create_map_projection_feature(MapProjectionFeatureCreate(
        projection_id=projection_id,
        media_id=row.media_id,
        atlas_order=-1,
        index_in_atlas=-1,
        # .tolist() yields the three coordinates as plain Python floats.
        coordinates=embedding_threed[position].tolist(),
        feature_id=row.feature_id
    ))

# Binary files Projection

In [None]:
from emv.pipelines.ioc import PipelineIOC

In [None]:
# Build the binary pose embeddings through the IOC pipeline.
# `force = True` presumably recomputes existing embeddings — verify in PipelineIOC.
pipeline_ioc = PipelineIOC()
pipeline_ioc.create_binary_pose_embeddings(force = True)

In [None]:
# Number of "pose-binary-extracted" features in the DB; used below to size
# the paginated download.
total_features = count_features_by_type("pose-binary-extracted")
print(f"Total features: {total_features}")

In [None]:
# Download all "pose-binary-extracted" features page by page.
MAX_FEATURES = total_features + 1
data = get_features_by_type_paginated("pose-binary-extracted", page_size=10000)

for _ in tqdm(range(MAX_FEATURES // 10000)):
    if not data:
        # Nothing was returned at all: there is no last id to continue from.
        break
    last_seen_id = data[-1].get("feature_id", None)
    if last_seen_id is None:
        break
    next_page = get_features_by_type_paginated("pose-binary-extracted", page_size=10000, last_seen_feature_id=last_seen_id)
    if not next_page:
        # All features have been retrieved; skip the remaining requests.
        break
    data.extend(next_page)

In [None]:
# Turn the list of downloaded features into a DataFrame.
data = pd.DataFrame(data)

In [None]:
# Load the IOC metadata and derive the media id ("ioc-" + sequence id) used
# as the join key with the features.
ioc_metadata = pd.read_hdf("data/metadata.hdf5")
ioc_metadata["media_id"] = ioc_metadata.seq_id.map(lambda x: f"ioc-{x}")

In [None]:
# Attach the sport of each media to its features. Left join: features without
# matching metadata are kept with a missing sport.
data = data.merge(ioc_metadata[["media_id", "sport"]], on="media_id", how="left")

In [None]:
def sample_df_by_category_round_robin(df, N, category_col="sport", random_state=None):
    """
    Sample up to N rows from a DataFrame, cycling through the categories.

    Categories are visited in sorted order and one random, not yet sampled
    row is drawn from each category per round, until N rows are drawn or
    every category is exhausted. Rows with a missing category are never
    sampled, because ``groupby`` ignores them.

    Args:
        df (pd.DataFrame): Input DataFrame with a categorical column.
        N (int): Maximum number of rows to sample.
        category_col (str): Name of the column containing category labels.
        random_state (int, optional): Seed for reproducibility. When None,
            the global ``random`` module state is used.

    Returns:
        pd.DataFrame: Sampled rows in drawing order, with a fresh index.
    """
    rng = random if random_state is None else random.Random(random_state)

    # Shuffle the row labels of every category once; popping from the end of
    # a shuffled list is an O(1) uniform draw without replacement, whereas
    # choice() followed by list.remove() costs O(n) per draw.
    pools = {}
    for cat, group_df in df.groupby(category_col):
        labels = group_df.index.tolist()
        rng.shuffle(labels)
        pools[cat] = labels

    # Consistent category order; exhausted categories leave the rotation.
    active = [pools[cat] for cat in sorted(pools)]
    sampled_indices = []

    while active and len(sampled_indices) < N:
        for pool in active:
            sampled_indices.append(pool.pop())
            if len(sampled_indices) >= N:
                break
        active = [pool for pool in active if pool]

    return df.loc[sampled_indices].reset_index(drop=True)


def sample_df_proportional_no_replacement(df, N, category_col="sport", random_state=None):
    """
    Sample exactly N unique rows from df, proportionally to category size,
    without replacement. If some categories are too small, the shortfall is
    drawn from the rows that have not been sampled yet.

    Args:
        df (pd.DataFrame): Input DataFrame with a categorical column.
        N (int): Total number of rows to sample.
        category_col (str): Column with categorical labels.
        random_state (int, optional): Seed for reproducibility.

    Returns:
        pd.DataFrame: Sampled DataFrame with exactly N unique rows.

    Raises:
        ValueError: If df does not hold enough rows to provide N of them.
    """
    rng = np.random.default_rng(random_state)
    grouped = df.groupby(category_col)

    # Step 1: ideal (fractional) share of each category, rounded down.
    category_counts = grouped.size()
    exact_share = category_counts / category_counts.sum() * N
    allocated = exact_share.astype(int)

    # Step 2: hand the rows lost by rounding down to the categories with the
    # largest fractional parts, one row each.
    remainder = int(N - allocated.sum())
    if remainder > 0:
        fractions = exact_share - allocated
        winners = fractions.sort_values(ascending=False).index[:remainder]
        allocated.loc[winners] += 1

    # Step 3: sample each category without replacement. A category that is
    # smaller than its allocation contributes all of its rows.
    sampled_dfs = []
    for cat, count in allocated.items():
        df_cat = grouped.get_group(cat)
        if count <= len(df_cat):
            seed = int(rng.integers(0, 1_000_000_000))
            sampled_dfs.append(df_cat.sample(n=int(count), random_state=seed))
        else:
            sampled_dfs.append(df_cat)

    # An empty slice keeps the columns when there was no category to sample.
    taken = pd.concat(sampled_dfs) if sampled_dfs else df.iloc[0:0]

    # Step 4: fill the shortfall from the rows not sampled yet.
    shortfall = N - len(taken)
    if shortfall > 0:
        remaining_pool = df.loc[~df.index.isin(taken.index)]
        if len(remaining_pool) < shortfall:
            raise ValueError("Not enough unique items in the dataset to satisfy N without replacement.")
        seed = int(rng.integers(0, 1_000_000_000))
        taken = pd.concat([taken, remaining_pool.sample(n=shortfall, random_state=seed)])

    return taken.reset_index(drop=True)

In [None]:
def sample_df_with_external_proportions(df, N, proportions_dict, category_col="sport", random_state=None):
    """
    Sample exactly N unique rows from a DataFrame according to external proportions,
    without replacement. Shortfalls in undersized categories are reallocated.

    Only the categories that occur in df take part in the allocation: the
    weights are renormalized over them, so a weight given to a category that
    is absent from df does not reduce the number of returned rows.

    Args:
        df (pd.DataFrame): DataFrame containing the items to sample from.
        N (int): Total number of rows to sample.
        proportions_dict (dict): Desired weight per category (e.g., {"soccer": 0.5}).
            Categories of df that are missing from the dict get weight 0.
        category_col (str): Name of the column with categories.
        random_state (int, optional): Random seed for reproducibility.

    Returns:
        pd.DataFrame: Sampled DataFrame with exactly N rows.

    Raises:
        ValueError: If no category of df has a positive weight, or if df does
            not hold enough rows to provide N of them.
    """
    rng = np.random.default_rng(random_state)

    categories = list(df[category_col].unique())
    present = set(categories)

    # Normalize the external weights over the categories that exist in df.
    total_weight = sum(weight for cat, weight in proportions_dict.items() if cat in present)
    if total_weight <= 0:
        raise ValueError("proportions_dict gives no positive weight to any category present in df.")
    exact_share = {cat: (proportions_dict.get(cat, 0) / total_weight) * N for cat in categories}

    # Initial allocation: round every share down.
    allocated = {cat: int(share) for cat, share in exact_share.items()}

    # Hand the rows lost by rounding down to the categories with the largest
    # fractional parts, one row each.
    remainder = N - sum(allocated.values())
    if remainder > 0:
        fractional_parts = {cat: exact_share[cat] - allocated[cat] for cat in categories}
        for cat in sorted(fractional_parts, key=fractional_parts.get, reverse=True):
            if remainder == 0:
                break
            allocated[cat] += 1
            remainder -= 1

    # Sampling phase. A category that is smaller than its allocation
    # contributes all of its rows.
    sampled_dfs = []
    for cat, count in allocated.items():
        df_cat = df[df[category_col] == cat]
        if count <= len(df_cat):
            seed = int(rng.integers(0, 1_000_000_000))
            sampled_dfs.append(df_cat.sample(n=count, random_state=seed))
        else:
            sampled_dfs.append(df_cat)

    taken = pd.concat(sampled_dfs) if sampled_dfs else df.iloc[0:0]

    # Fill whatever is still missing from the rows not sampled yet.
    shortfall = N - len(taken)
    if shortfall > 0:
        remaining_pool = df.loc[~df.index.isin(taken.index)]
        if len(remaining_pool) < shortfall:
            raise ValueError("Not enough data to sample N items without replacement.")
        seed = int(rng.integers(0, 1_000_000_000))
        taken = pd.concat([taken, remaining_pool.sample(n=shortfall, random_state=seed)])

    return taken.reset_index(drop=True)

In [None]:
# Draw two samples of the same size with different strategies to compare the
# resulting sport distributions.
sample_n = 220000
round_robin_sample = sample_df_by_category_round_robin(data, sample_n, category_col="sport")
proportional_sample = sample_df_proportional_no_replacement(data, sample_n, category_col="sport", random_state=42)

print(len(round_robin_sample), len(proportional_sample))

In [None]:
# Bar charts of the number of poses per sport, one panel per sample.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panels = (
    (round_robin_sample, "Round Robin Sample", 'skyblue'),
    (proportional_sample, "Proportional Sample", 'salmon'),
)
for ax, (sample, title, bar_color) in zip(axes, panels):
    sample.sport.value_counts().plot(kind='bar', ax=ax, color=bar_color)
    ax.set_title(title)
    ax.set_xlabel("Sport")
    ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Left: summed video duration per sport. Right: number of videos per sport.
# Both are sorted in descending order.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
ioc_metadata.groupby("sport").duration_sec.sum().sort_values(ascending=False).plot(kind="barh", ax=axs[0])
axs[0].set_title("Total duration per sport", fontweight = "bold")
# count() counts the non-missing duration values of each sport.
ioc_metadata.groupby("sport").duration_sec.count().sort_values(ascending=False).plot(kind="barh", ax=axs[1])
axs[1].set_title("Total number of videos per sport", fontweight = "bold")
plt.tight_layout()
plt.show()

In [None]:
# Draw two samples whose sport proportions follow the metadata instead of the
# number of poses: one weighted by total duration, one by number of videos.
sample_n = 220000

# Share of the total video duration per sport.
duration_per_sport = ioc_metadata.groupby("sport").duration_sec.sum()
duration_per_sport = duration_per_sport / duration_per_sport.sum()

duration_prop_sample = sample_df_with_external_proportions(
    data, sample_n, duration_per_sport.to_dict(), category_col="sport", random_state=42
)

# Share of the number of videos per sport.
n_videos_per_sport = ioc_metadata.groupby("sport").duration_sec.count()
n_videos_per_sport = n_videos_per_sport / n_videos_per_sport.sum()

n_videos_prop_sample = sample_df_with_external_proportions(
    data, sample_n, n_videos_per_sport.to_dict(), category_col="sport", random_state=42
)

print(len(duration_prop_sample), len(n_videos_prop_sample))

In [None]:
# Bar charts of the number of poses per sport, one panel per sample.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panels = (
    (duration_prop_sample, "Duration Proportional Sample", 'lightgreen'),
    (n_videos_prop_sample, "Video Count Proportional Sample", 'lightcoral'),
)
for ax, (sample, title, bar_color) in zip(axes, panels):
    sample.sport.value_counts().plot(kind='bar', ax=ax, color=bar_color)
    ax.set_title(title)
    ax.set_xlabel("Sport")
    ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
def compute_cylinder_projection(df, min_dist=0.1, n_neighbors=100):
    """Project the pose embeddings of ``df`` onto a cylinder with UMAP.

    Args:
        df (pd.DataFrame): Must hold an "embedding_33" column whose values
            are either string literals of a list or already-parsed sequences.
        min_dist (float): UMAP ``min_dist`` parameter.
        n_neighbors (int): UMAP ``n_neighbors`` parameter.

    Returns:
        pd.DataFrame: The same ``df`` object, modified in place with a new
        "projection" column holding the 2D output coordinates of each row.
    """
    # The two arguments were previously ignored in favor of hard-coded values.
    cylinder_mapper = UMAP(output_metric=cylinder_euclidean_grad, min_dist=min_dist, n_neighbors=n_neighbors, random_state=42)
    # Embeddings read from the DB are strings; parse only those.
    features = np.array([
        literal_eval(emb) if isinstance(emb, str) else emb
        for emb in df['embedding_33']
    ])
    embedding = cylinder_mapper.fit_transform(features)
    df["projection"] = embedding.tolist()

    return df

In [None]:
# Map sports to colors: one "Set2" palette color per sport, in order of first
# appearance. The lookup table is built once instead of rebuilding the list of
# unique sports for every row.
sport_labels = list(data.sport.unique())
color_palette = sns.color_palette("Set2", n_colors=len(sport_labels))
sport_to_color = {sport: color_palette[position] for position, sport in enumerate(sport_labels)}
colors = data.sport.map(lambda x: sport_to_color[x])

In [None]:
# Compute the cylindrical UMAP projection of the duration-proportional sample;
# the result is stored in its "projection" column.
duration_prop_sample = compute_cylinder_projection(duration_prop_sample)
#n_videos_prop_sample = compute_cylinder_projection(n_videos_prop_sample)

In [None]:
def plot_cylinder_projection(df, colors):
    """Plot a cylindrical projection in 3D and as an unwrapped 2D map.

    Args:
        df (pd.DataFrame): Must hold a "projection" column with one
            (theta, height) pair per row.
        colors: Point colors, passed to matplotlib's ``scatter`` as ``c``.
    """
    # Panorama dimensions
    R_pano = 1
    H_pano = 1

    # Cylindrical dimension (theta) and height (h)
    cylinder_dimension = 2 * np.pi
    radius = R_pano  # Radius of the cylinder

    embedding = np.array(df['projection'].tolist())

    # Fold the angular coordinate into [0, 2*pi).
    theta_coords = embedding[:, 0] % cylinder_dimension

    # Remap the heights to [0, H_pano], the size of the panorama. When all
    # heights are identical there is no range to stretch: put them at 0
    # instead of dividing by zero.
    h_coords = embedding[:, 1]
    h_span = np.max(h_coords) - np.min(h_coords)
    if h_span > 0:
        h_coords = H_pano * (h_coords - np.min(h_coords)) / h_span
    else:
        h_coords = np.zeros_like(h_coords, dtype=float)

    # Convert cylindrical coordinates to Cartesian coordinates
    x = radius * np.cos(theta_coords)
    y = radius * np.sin(theta_coords)
    z = h_coords

    fig = plt.figure(figsize=(20, 8))
    ax1 = fig.add_subplot(121, projection='3d')
    ax1.scatter(x, y, z, c=colors, s=0.1)
    ax1.set_title("Cylindrical projection", fontweight="bold")

    ax2 = fig.add_subplot(122)
    ax2.scatter(theta_coords, h_coords, c=colors, s=0.1)
    ax2.set_title("Unwrapped cylinder", fontweight="bold")

    plt.tight_layout()
    plt.show()

In [None]:
# Plot the projection of the duration-proportional sample with all points in black.
plot_cylinder_projection(duration_prop_sample, ["black"] * len(duration_prop_sample))

In [None]:
# Atlas-related values stored with the projection.
total_tiles = len(duration_prop_sample) # either all features or a subset of features
atlas_width = 4096
max_tile_size = 512
max_tiles_per_atlas = (atlas_width // max_tile_size) ** 2
# Ceiling division: `int(total / per_atlas) + 1` counted one atlas too many
# whenever total_tiles was an exact multiple of max_tiles_per_atlas.
# At least one atlas is always reported, as before.
atlas_count = max(1, (total_tiles + max_tiles_per_atlas - 1) // max_tiles_per_atlas)

In [None]:
# Create the projection entry in the DB. Replace the name and version with the
# desired ones; library_id = 2 is the IOC library.
projection = Projection(
    projection_name="IOC Poses Binary Cylindrical UMAP Duration Proportional",
    version="0.0.2",
    library_id=2,
    model_name="openpifpaf_fast",
    model_params={},
    data={},
    dimension=3,
    atlas_folder_path="",
    atlas_width=atlas_width,
    tile_size=max_tile_size,
    atlas_count=atlas_count,
    total_tiles=total_tiles,
    tiles_per_atlas=max_tiles_per_atlas,
)

# The id of the new projection is needed to attach the mapped features to it.
projection_id = create_projection(projection)['projection_id']
print(f"Projection ID: {projection_id}")

In [None]:
def compute_cartesian_projection(embedding, radius=1, height=1):
    """Convert (theta, height) cylinder coordinates to 3D Cartesian points.

    Args:
        embedding: Array-like with at least two columns; column 0 is the
            angular coordinate, column 1 the linear coordinate.
        radius: Radius of the cylinder (panorama radius).
        height: Height of the cylinder; the linear coordinates are remapped
            to [0, height].

    Returns:
        np.ndarray of shape (n_points, 3) holding (x, y, z) per point. When
        all linear coordinates are identical, every z is 0.
    """
    embedding = np.asarray(embedding)

    # Cylindrical dimension (theta): fold the angle into [0, 2*pi).
    cylinder_dimension = 2 * np.pi
    theta_coords = embedding[:, 0] % cylinder_dimension

    # Remap the heights to [0, height]. A zero range cannot be stretched, so
    # those points are placed at 0 instead of dividing by zero.
    h_coords = embedding[:, 1]
    h_min = np.min(h_coords)
    h_span = np.max(h_coords) - h_min
    if h_span > 0:
        z = height * (h_coords - h_min) / h_span
    else:
        z = np.zeros_like(h_coords, dtype=float)

    # Convert cylindrical coordinates to Cartesian coordinates
    x = radius * np.cos(theta_coords)
    y = radius * np.sin(theta_coords)

    return np.stack([x, y, z], axis=1)

In [None]:
# One (x, y, z) point per row of the sample, in positional order.
embedding_cartesian = compute_cartesian_projection(np.array(duration_prop_sample["projection"].tolist()))

# Create an entry in the map_projection_feature table for each feature; it
# links features, media and coordinates. Rows are paired with their
# coordinates by position, so the result does not depend on the index labels
# of the sample.
for position, (_, row) in enumerate(duration_prop_sample.iterrows()):
    create_map_projection_feature(MapProjectionFeatureCreate(
        projection_id=projection_id,
        media_id=row.media_id,
        atlas_order=-1,
        index_in_atlas=-1,
        # .tolist() yields the three coordinates as plain Python floats.
        coordinates=embedding_cartesian[position].tolist(),
        feature_id=row.feature_id
    ))