In [None]:
# Move two directories up; the relative "data/..." paths used further down are
# resolved from there (presumably the repository root — confirm).
%cd ../..
# Enable the autoreload extension so edited modules are re-imported automatically.
%load_ext autoreload

# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
import os
from emv.db.dao import DataAccessObject
from emv.db.queries import get_features_by_type_paginated, count_features_by_type
from sqlalchemy.sql import text
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import numpy as np
import cv2
from PIL import Image
from emv.api.models import Feature
from emv.api.models import Projection, MapProjectionFeatureCreate
from emv.db.queries import create_projection, create_map_projection_feature, create_feature, get_library_id_from_name, get_all_media_by_library_id, count_media_by_library_id
from umap import UMAP
import numba
from tqdm import tqdm
from emv.storage.storage import get_storage_client

# Load data

In [None]:
# Resolve the "mjf" library id and count the media of type "video" it contains.
lib_id = get_library_id_from_name("mjf")
n_videos = count_media_by_library_id(
    lib_id,
    media_type="video",
)
print(f"Total binaries in MJF library: {n_videos}")

In [None]:
# Page through every video of the library, 1000 at a time. Each follow-up request
# resumes after the last media_id received so far.
MAX_FEATURES = n_videos + 1
videos = get_all_media_by_library_id(lib_id, page_size=1000, media_type="video")

for _ in tqdm(range(MAX_FEATURES // 1000)):
    if not videos:
        # Nothing was returned at all: `videos[-1]` would raise IndexError.
        break
    last_seen_id = videos[-1].get("media_id", None)
    if last_seen_id is None:
        break
    next_page = get_all_media_by_library_id(
        lib_id, page_size=1000, media_type="video", last_seen_media_id=last_seen_id
    )
    if not next_page:
        # End reached: the cursor would not move, so further requests are pointless.
        break
    videos.extend(next_page)

# The following cells use DataFrame accessors (`videos.metadata`, `videos["top_genre"]`),
# so convert the list of records right here, like the other pagination cells do.
videos = pd.DataFrame(videos)

In [None]:
# Inspect the metadata of the first video to see which keys are available.
videos["metadata"].iloc[0]

In [None]:
# Pull the "top_genre" entry out of every metadata dict (None when the key is absent).
videos["top_genre"] = videos["metadata"].map(lambda meta: meta.get("top_genre"))

In [None]:
# Number of videos per top genre.
videos.top_genre.value_counts()

In [None]:
# Make sure `videos` is a DataFrame.
# NOTE(review): the cells above already use `videos.metadata` and `videos["top_genre"]`,
# which need a DataFrame too — on a fresh kernel the conversion has to happen before them.
videos = pd.DataFrame(data=videos)

In [None]:
# One row per media item: a "media_id" column plus the vector columns
# (presumably one score per genre — confirm against the CSV).
genres = pd.read_csv(filepath_or_buffer="data/mjf_vectors_genre.csv")

In [None]:
# Keep the id column and collapse every other column of `genres` into a single
# list-valued "genres_f" column (one vector per row).
features = genres[["media_id"]].copy()
genre_columns = genres.drop(columns="media_id")
features["genres_f"] = genre_columns.agg(list, axis=1).tolist()

In [None]:
# Preview only the first rows instead of dumping the whole frame.
features.head()

# Add features to DB
Since embedding sizes in the DB are restricted to 33, 1024, or larger, we pad each vector with zeros up to the closest larger size (1024 here) and store the true size in the feature's `data` field.

In [None]:
# Length of the unpadded genre vector, read from the first row.
first_vector = features["genres_f"].iloc[0]
true_embedding_size = len(first_vector)
print(f"True embedding size: {true_embedding_size}")

In [None]:
# Right-pad every genre vector with zeros until it holds 1024 entries.
features["embedding_1024"] = features["genres_f"].map(
    lambda vector: vector + [0] * (1024 - len(vector))
)

In [None]:
# Sanity check: the padded vector of the first row should have 1024 entries.
len(features.embedding_1024.iloc[0])

In [None]:
# Persist one "musical_genre" Feature per row. The unpadded vector length is kept
# in `data` so the zero padding can be stripped again when the feature is read back.
for _, record in features.iterrows():
    create_feature(
        Feature(
            feature_type="musical_genre",
            version="1",
            model_name="Essentia Tensorflow MTG-Jamendo",
            model_params={"pretrained model": "MTG-Jamendo"},
            data={"true_embedding_size": true_embedding_size},
            media_id=record["media_id"],
            embedding_size=1024,
            embedding_1024=record["embedding_1024"],
        )
    )

# Create projection

## Compute 3D projection

In [None]:
# Number of "musical_genre" features currently stored in the DB.
total_features = count_features_by_type(
    "musical_genre",
)
print(f"Total features: {total_features}")

In [None]:
# Page through every "musical_genre" feature, 10000 at a time. Each follow-up
# request resumes after the last feature_id received so far.
MAX_FEATURES = total_features + 1
features = get_features_by_type_paginated("musical_genre", page_size=10000)

for _ in tqdm(range(MAX_FEATURES // 10000)):
    if not features:
        # Nothing was returned at all: `features[-1]` would raise IndexError.
        break
    last_seen_id = features[-1].get("feature_id", None)
    if last_seen_id is None:
        break
    next_page = get_features_by_type_paginated(
        "musical_genre", page_size=10000, last_seen_feature_id=last_seen_id
    )
    if not next_page:
        # End reached: the cursor would not move, so further requests are pointless.
        break
    features.extend(next_page)

features = pd.DataFrame(features)
print(f"Total features: {len(features)}")

In [None]:
# Parse the stored embedding string and cut it back to the true (unpadded) length
# recorded in each feature's `data`.
feature_vectors = features.apply(
    lambda row: literal_eval(row["embedding_1024"])[: row["data"]["true_embedding_size"]],
    axis=1,
)

In [None]:
# Hyperparameter grid: one UMAP run is made for every (n_neighbors, min_dist) pair.
n_neighbors = [50, 100, 500, 1000]
min_dists = [0.1, 0.3, 0.5, 0.7]

In [None]:
FORCE_RECALC = False  # set to True to ignore cached embeddings and recompute them

# np.save does not create missing directories, so make sure the cache folder exists.
os.makedirs("data/embeddings", exist_ok=True)

# Results are appended in grid order: n_neighbors is the outer loop, min_dists the inner one.
umap_embeddings = []
for n in n_neighbors:
    for min_dist in min_dists:
        cache_path = f"data/embeddings/umap_embeddings_{n}_{min_dist}.npy"
        # The `and not FORCE_RECALC` must sit outside os.path.exists(): inside the call
        # the argument collapses to a bool, which is interpreted as a file descriptor,
        # so the cache branch was taken even when no file existed.
        if os.path.exists(cache_path) and not FORCE_RECALC:
            print(f"Loading UMAP embeddings for n_neighbors={n}, min_dist={min_dist} from disk")
            embeddings_results = np.load(cache_path)
        else:
            print(f"Calculating UMAP embeddings for n_neighbors={n}, min_dist={min_dist}")
            umap = UMAP(n_neighbors=n, min_dist=min_dist, metric='cosine', n_components=3, random_state=42)
            embeddings_results = umap.fit_transform(feature_vectors.tolist())
            np.save(cache_path, embeddings_results)
        umap_embeddings.append(embeddings_results)

In [None]:
# Grid of 3D scatter plots: one row per n_neighbors value, one column per min_dist value
n_rows, n_cols = len(n_neighbors), len(min_dists)
fig = plt.figure(figsize=(20, 20))

for row_idx, neighbors in enumerate(n_neighbors):
    for col_idx, dist in enumerate(min_dists):
        # Embeddings were appended row by row, so this is the matching list position.
        flat_idx = row_idx * n_cols + col_idx
        points = umap_embeddings[flat_idx]
        ax = fig.add_subplot(n_rows, n_cols, flat_idx + 1, projection='3d')
        ax.set_title(f"UMAP n_neighbors={neighbors} min_dist={dist}")
        ax.scatter(points[:, 0], points[:, 1], points[:, 2], s=0.05, alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Keep the embedding computed last, i.e. the one for the final n_neighbors / min_dists
# pair of the grid, as the 3D coordinates of each feature.
selected_embedding = umap_embeddings[-1]
features["umap_3d"] = selected_embedding.tolist()

## Initialize projection

In [None]:
total_tiles = len(features)  # either all features or a subset of features
atlas_width = 4096
max_tile_size = 512
# Tiles are laid out on a square grid inside each atlas.
max_tiles_per_atlas = (atlas_width // max_tile_size) ** 2
# Ceiling division: `int(total / per_atlas) + 1` counted one atlas too many whenever
# total_tiles was an exact multiple of max_tiles_per_atlas. At least one atlas is kept
# so the empty case behaves as before.
atlas_count = max(1, (total_tiles + max_tiles_per_atlas - 1) // max_tiles_per_atlas)

In [None]:
# Create the projection, replace the names with the desired ones
projection = Projection(
    projection_name="MJF Musical Genre 39k",
    version="1",
    library_id=get_library_id_from_name("mjf"),
    model_name="umap",
    # The stored coordinates come from `umap_embeddings[-1]`, i.e. the last
    # (n_neighbors, min_dist) pair of the grid. Derive the recorded parameters from the
    # same lists so the metadata describes the embedding actually saved (the previous
    # hard-coded min_dist of 0.5 did not match it). If the 0.5 run was the intended one,
    # change the selection in the cell that fills `umap_3d` instead.
    model_params={"n_neighbors": n_neighbors[-1], "min_dist": min_dists[-1]},
    data={},
    dimension=3,
    atlas_folder_path="",
    atlas_width=atlas_width,
    tile_size=max_tile_size,
    atlas_count=atlas_count,
    total_tiles=total_tiles,
    tiles_per_atlas=max_tiles_per_atlas,
)

# Persist the projection and keep its id for the atlas and mapping steps below.
projection_id = create_projection(projection)['projection_id']
print(f"Projection ID: {projection_id}")

## Generate thumbnail atlases

In [None]:
# Resolve the "mjf" library id and count the media of type "image" it contains.
lib_id = get_library_id_from_name("mjf")
n_thumbnails = count_media_by_library_id(
    lib_id,
    media_type="image",
)
print(f"Total binaries in MJF library: {n_thumbnails}")

In [None]:
# Page through every image of the library, 1000 at a time. Each follow-up request
# resumes after the last media_id received so far.
MAX_FEATURES = n_thumbnails + 1
thumbnails_data = get_all_media_by_library_id(lib_id, page_size=1000, media_type="image")

for _ in tqdm(range(MAX_FEATURES // 1000)):
    if not thumbnails_data:
        # Nothing was returned at all: `thumbnails_data[-1]` would raise IndexError.
        break
    last_seen_id = thumbnails_data[-1].get("media_id", None)
    if last_seen_id is None:
        break
    next_page = get_all_media_by_library_id(
        lib_id, page_size=1000, media_type="image", last_seen_media_id=last_seen_id
    )
    if not next_page:
        # End reached: the cursor would not move, so further requests are pointless.
        break
    thumbnails_data.extend(next_page)

thumbnails_data = pd.DataFrame(thumbnails_data)

In [None]:
# Storage paths of all thumbnails, in the row order of `thumbnails_data`.
thumbnails_paths = thumbnails_data.media_path.to_list()

In [None]:
storage_client = get_storage_client()

def get_thumbnail(media_path):
    """Fetch one thumbnail through the storage client and decode it to an RGB PIL image.

    Args:
        media_path: Path of the image, passed to ``storage_client.get_bytes("mjf", ...)``.

    Returns:
        A ``PIL.Image.Image`` in RGB order, or ``None`` when the storage client does not
        return bytes, the payload is empty, or the payload cannot be decoded.
    """
    frame_bytes = storage_client.get_bytes("mjf", media_path)
    if not isinstance(frame_bytes, bytes) or len(frame_bytes) == 0:
        return None

    # Force an 8-bit, 3-channel BGR result so the BGR->RGB conversion below always
    # applies (flag -1 could yield grayscale or 16-bit arrays). IMREAD_IGNORE_ORIENTATION
    # keeps the pixel layout of the previous "unchanged" decoding (no EXIF rotation).
    frame = cv2.imdecode(
        np.frombuffer(frame_bytes, np.uint8),
        cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION,
    )
    if frame is None:
        # imdecode signals undecodable data by returning None; treat it as missing.
        return None

    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

In [None]:
# Download and decode every thumbnail (entries are None where loading failed).
thumbnails = list(map(get_thumbnail, tqdm(thumbnails_paths)))

In [None]:
# Substitute a 460x460 black placeholder for every thumbnail that could not be loaded.
# NOTE(review): the placeholder is a NumPy array while loaded thumbnails are PIL images —
# confirm that the atlas builder accepts both types.
black_image = np.zeros(shape=(460, 460, 3), dtype=np.uint8)
thumbnails = [black_image if img is None else img for img in thumbnails]

In [None]:
# Build the square thumbnail atlases for this projection from the loaded thumbnails.
# NOTE(review): `create_square_atlases` is not imported in the visible import cell —
# confirm where it comes from, otherwise this fails on a fresh kernel.
square_atlases = create_square_atlases(atlas_name="atlas_mjf",
                                       projection_id=projection_id, 
                                       images=thumbnails, 
                                       width=atlas_width, 
                                       max_tile_size=max_tile_size, 
                                       no_border=True)

## Add Projection to DB

In [None]:
# Register every feature in the projection: its 3D coordinates plus the atlas number
# and the slot inside that atlas, both derived from the row index.
# NOTE(review): this assumes the row order of `features` matches the order of the
# thumbnails handed to the atlas builder — confirm.
for i, row in features.iterrows():
    atlas_order, index_in_atlas = divmod(i, max_tiles_per_atlas)
    mapping = MapProjectionFeatureCreate(
        projection_id=projection_id,
        media_id=row.media_id,
        atlas_order=atlas_order,
        index_in_atlas=index_in_atlas,
        coordinates=row["umap_3d"],
        feature_id=row.feature_id,
    )
    create_map_projection_feature(mapping)