In [1]:
# Embedding Projection & Visualization
#
# This notebook loads pre-extracted anime embeddings from the recommender model,
# projects them to 2D/3D using dimensionality reduction, and visualizes the results.
!pip install plotly umap-learn

Collecting plotly
  Downloading plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting umap-learn
  Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-2.14.0-py3-none-any.whl.metadata (13 kB)
Collecting scikit-learn>=1.6 (from umap-learn)
  Downloading scikit_learn-1.8.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading plotly-6.5.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading umap_learn-0.5.9.post2-py3-none-any.whl (90 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.1/90.1 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading narwhals-2.14.0-py3-none-any.whl (430 kB)
[2K   [90m━━━━━━━

In [2]:
import json
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path

# Input/output locations: every artifact lives under a single data directory
DATA_DIR = Path("./work/data")
EMBEDDINGS_PATH = DATA_DIR.joinpath("anime_embeddings.npy")    # embedding matrix, loaded with np.load
CORPUS_IDS_PATH = DATA_DIR.joinpath("corpus_ids.json")         # JSON list mapping row index -> anime ID
METADATA_PATH = DATA_DIR.joinpath("processed-metadata.csv")    # per-anime metadata table

## Load Data

In [3]:
# Load embeddings
embeddings = np.load(EMBEDDINGS_PATH)
print(f"Embeddings shape: {embeddings.shape}")

# Load corpus IDs (maps index -> anime ID)
with open(CORPUS_IDS_PATH, encoding="utf-8") as f:
    corpus_ids = json.load(f)
print(f"Corpus size: {len(corpus_ids)}")

# Row i of `embeddings` is paired with corpus_ids[i] by position in later cells,
# so a length mismatch would silently attach coordinates to the wrong titles.
assert embeddings.shape[0] == len(corpus_ids), (
    f"embeddings has {embeddings.shape[0]} rows but corpus_ids has {len(corpus_ids)} entries"
)

# Load metadata
metadata_df = pd.read_csv(METADATA_PATH)
print(f"Metadata entries: {len(metadata_df)}")
metadata_df.head()

Embeddings shape: (6000, 4096)
Corpus size: 6000
Metadata entries: 23979


Unnamed: 0,id,title,title_english,related_anime,recommendations,aired_from_year,rating_count,average_rating,media_type,rating
0,1,Cowboy Bebop,Cowboy Bebop,"[{""node"": {""id"": 5, ""title"": ""Cowboy Bebop: Te...","[{""node"": {""id"": 205, ""title"": ""Samurai Champl...",1998,650947,4.650114,tv,r
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,"[{""node"": {""id"": 1, ""title"": ""Cowboy Bebop"", ""...","[{""node"": {""id"": 4106, ""title"": ""Trigun: Badla...",2001,150835,4.883701,movie,r
2,6,Trigun,Trigun,"[{""node"": {""id"": 4106, ""title"": ""Trigun: Badla...","[{""node"": {""id"": 1, ""title"": ""Cowboy Bebop"", ""...",1998,288723,3.66125,tv,pg_13
3,7,Witch Hunter Robin,Witch Hunter Robin,[],"[{""node"": {""id"": 2025, ""title"": ""Darker than B...",2002,45571,2.038862,tv,pg_13
4,8,Bouken Ou Beet,Beet the Vandel Buster,"[{""node"": {""id"": 1123, ""title"": ""Bouken Ou Bee...","[{""node"": {""id"": 417, ""title"": ""Ragnar\u00f6k ...",2004,5235,2.48233,tv,pg


In [4]:
# Create a dataframe linking corpus index to metadata
# corpus_ids[i] = anime_id for corpus index i
corpus_df = pd.DataFrame({
    "corpus_idx": range(len(corpus_ids)),
    "id": corpus_ids
})

# Merge with metadata.
# validate="many_to_one" makes pandas raise if an id occurs more than once in the
# metadata; otherwise the left join would duplicate corpus rows and `df` would no
# longer line up row-for-row with `embeddings`.
df = corpus_df.merge(metadata_df, on="id", how="left", validate="many_to_one")
assert len(df) == len(corpus_ids), "merge changed the number of corpus rows"

# Use English title if available, otherwise Japanese title
df["display_title"] = df["title_english"].fillna(df["title"])

print(f"Matched {df['title'].notna().sum()} / {len(df)} items with metadata")
df.head()

Matched 6000 / 6000 items with metadata


Unnamed: 0,corpus_idx,id,title,title_english,related_anime,recommendations,aired_from_year,rating_count,average_rating,media_type,rating,display_title
0,0,38000,Kimetsu no Yaiba,Demon Slayer: Kimetsu no Yaiba,"[{""node"": {""id"": 40456, ""title"": ""Kimetsu no Y...","[{""node"": {""id"": 40748, ""title"": ""Jujutsu Kais...",2019,1196964,6.338178,tv,r,Demon Slayer: Kimetsu no Yaiba
1,1,16498,Shingeki no Kyojin,Attack on Titan,"[{""node"": {""id"": 18397, ""title"": ""Shingeki no ...","[{""node"": {""id"": 28623, ""title"": ""Koutetsujou ...",2013,1184305,6.611223,tv,r,Attack on Titan
2,2,40748,Jujutsu Kaisen,Jujutsu Kaisen,"[{""node"": {""id"": 38777, ""title"": ""Jujutsu Kais...","[{""node"": {""id"": 38000, ""title"": ""Kimetsu no Y...",2020,1158795,6.250011,tv,r,Jujutsu Kaisen
3,3,1535,Death Note,Death Note,"[{""node"": {""id"": 2994, ""title"": ""Death Note: R...","[{""node"": {""id"": 1575, ""title"": ""Code Geass: H...",2006,1098323,6.445127,tv,r,Death Note
4,4,30276,One Punch Man,One-Punch Man,"[{""node"": {""id"": 31704, ""title"": ""One Punch Ma...","[{""node"": {""id"": 32182, ""title"": ""Mob Psycho 1...",2015,1053382,6.387316,tv,r,One-Punch Man


## Projection Configuration

Configure the dimensionality reduction method and its parameters here.
Every method implements the same `ProjectionMethod` interface, so it is easy to swap between UMAP, t-SNE, PyMDE, etc.

In [5]:
from abc import ABC, abstractmethod
from typing import Literal

class ProjectionMethod(ABC):
    """Abstract interface shared by all dimensionality-reduction backends.

    Subclasses must implement ``fit_transform`` and the read-only ``name``
    property; the class cannot be instantiated until both are provided.
    """

    @abstractmethod
    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Return a lower-dimensional projection of the high-dimensional array ``X``."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Short human-readable label describing the method."""


class UMAPProjection(ProjectionMethod):
    """Projection backend that delegates to ``umap.UMAP``.

    The constructor only records the hyperparameters; the reducer itself is
    created each time ``fit_transform`` is called.
    """

    def __init__(
        self,
        n_components: int = 2,
        n_neighbors: int = 15,
        min_dist: float = 0.1,
        metric: str = "cosine",
        random_state: int = 42,
    ):
        # Stored verbatim and forwarded to umap.UMAP in fit_transform.
        self.n_components, self.n_neighbors = n_components, n_neighbors
        self.min_dist, self.metric = min_dist, metric
        self.random_state = random_state

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Fit a fresh UMAP reducer on ``X`` and return the projected coordinates."""
        # Function-local import: umap is only required when this backend is actually run.
        import umap

        umap_kwargs = {
            "n_components": self.n_components,
            "n_neighbors": self.n_neighbors,
            "min_dist": self.min_dist,
            "metric": self.metric,
            "random_state": self.random_state,
        }
        return umap.UMAP(**umap_kwargs).fit_transform(X)

    @property
    def name(self) -> str:
        """Label containing the neighbor count and minimum distance."""
        return f"UMAP (n={self.n_neighbors}, d={self.min_dist})"


class TSNEProjection(ProjectionMethod):
    """t-SNE dimensionality reduction backed by ``sklearn.manifold.TSNE``.

    Args:
        n_components: Number of output dimensions.
        perplexity: t-SNE perplexity.
        learning_rate: Optimizer learning rate.
        n_iter: Maximum number of optimization iterations. The name is kept for
            backward compatibility; it is forwarded to scikit-learn as
            ``max_iter`` when the installed version supports that keyword.
        random_state: Seed forwarded to the reducer.
    """

    def __init__(
        self,
        n_components: int = 2,
        perplexity: float = 30.0,
        learning_rate: float = 200.0,
        n_iter: int = 1000,
        random_state: int = 42,
    ):
        self.n_components = n_components
        self.perplexity = perplexity
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.random_state = random_state

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Fit a fresh TSNE reducer on ``X`` and return the projected coordinates."""
        import inspect

        from sklearn.manifold import TSNE

        # scikit-learn renamed TSNE's `n_iter` to `max_iter` (deprecated in 1.5,
        # removed in 1.7). Passing `n_iter` to a recent release raises TypeError,
        # so choose whichever keyword the installed version actually accepts.
        tsne_params = inspect.signature(TSNE).parameters
        iter_kwarg = "max_iter" if "max_iter" in tsne_params else "n_iter"

        reducer = TSNE(
            n_components=self.n_components,
            perplexity=self.perplexity,
            learning_rate=self.learning_rate,
            random_state=self.random_state,
            **{iter_kwarg: self.n_iter},
        )
        return reducer.fit_transform(X)

    @property
    def name(self) -> str:
        """Label containing the perplexity."""
        return f"t-SNE (perp={self.perplexity})"


class PyMDEProjection(ProjectionMethod):
    """Projection backend built on ``pymde.preserve_neighbors``.

    The constructor only records the hyperparameters; the embedding problem is
    created and solved each time ``fit_transform`` is called.
    """

    def __init__(
        self,
        n_components: int = 2,
        n_neighbors: int = 15,
        repulsive_fraction: float = 0.5,
        random_state: int = 42,
    ):
        self.n_components, self.n_neighbors = n_components, n_neighbors
        self.repulsive_fraction = repulsive_fraction
        self.random_state = random_state

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Embed ``X`` with PyMDE and return the result as a NumPy array."""
        # Function-local imports: pymde/torch are only required when this backend is run.
        import pymde
        import torch

        # Seed torch before building the problem, using the configured random_state.
        torch.manual_seed(self.random_state)

        problem = pymde.preserve_neighbors(
            X,
            embedding_dim=self.n_components,
            n_neighbors=self.n_neighbors,
            repulsive_fraction=self.repulsive_fraction,
        )
        # embed() returns a torch tensor; move it to CPU before converting.
        return problem.embed().cpu().numpy()

    @property
    def name(self) -> str:
        """Label containing the neighbor count."""
        return f"PyMDE (n={self.n_neighbors})"


# ============================================================
# CONFIGURATION - Modify these to experiment
# ============================================================

N_COMPONENTS = 2  # output dimensionality: 2 -> 2D plots, 3 -> 3D plots

# Active projection. Alternatives to try (swap in for the assignment below):
# projection_method = UMAPProjection(n_components=N_COMPONENTS, n_neighbors=15, min_dist=0.1)
# projection_method = TSNEProjection(n_components=N_COMPONENTS, perplexity=30)
projection_method = UMAPProjection(
    n_components=N_COMPONENTS,
    n_neighbors=20,
    min_dist=0.01,
    metric="cosine",
)

print(f"Using: {projection_method.name}")

Using: UMAP (n=20, d=0.01)


## Run Projection

In [6]:
%%time

# Run the projection
projected = projection_method.fit_transform(embeddings)
print(f"Projected shape: {projected.shape}")

# Add projected coordinates to dataframe
if N_COMPONENTS == 2:
    df["x"] = projected[:, 0]
    df["y"] = projected[:, 1]
elif N_COMPONENTS == 3:
    df["x"] = projected[:, 0]
    df["y"] = projected[:, 1]
    df["z"] = projected[:, 2]

2025-12-26 23:52:50.136981: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-26 23:52:50.194675: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-26 23:52:50.437380: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-26 23:52:50.437404: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-26 23:52:50.438637: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

Projected shape: (6000, 2)
CPU times: user 20.4 s, sys: 3.57 s, total: 24 s
Wall time: 23.1 s


## Select Shows to Label

Pick notable/popular shows across different genres to help identify clusters.

In [7]:
# Titles to annotate on the visualization.
# Each entry is looked up with find_show() further down in this cell, which tries an
# exact match first and then falls back to a case-insensitive partial match.
# Add/remove titles here to customize labels; the mix of genres and eras is meant
# to help identify clusters. The genre comments below are only for organization.

LABELED_SHOWS = [
    # Shonen action
    "Naruto",
    "One Piece",
    "Bleach",
    "Dragon Ball Z",
    "Hunter x Hunter (2011)",
    "My Hero Academia",
    "Jujutsu Kaisen",
    "Demon Slayer: Kimetsu no Yaiba",

    # Psychological/Thriller
    "Death Note",
    "Monster",
    "Psycho-Pass",
    "Steins;Gate",
    "Neon Genesis Evangelion",

    # Romance/Drama
    "Clannad",
    "Your Lie in April",
    "Toradora!",
    "Fruits Basket",
    "Violet Evergarden",

    # Sci-Fi
    "Cowboy Bebop",
    "Ghost in the Shell: Stand Alone Complex",
    "Code Geass: Lelouch of the Rebellion",
    "Cyberpunk: Edgerunners",

    # Fantasy
    "Fullmetal Alchemist: Brotherhood",
    "Attack on Titan",
    "Sword Art Online",
    "Re:Zero",
    "Made in Abyss",

    # Slice of Life
    "K-On!",
    "Nichijou",
    "A Place Further Than the Universe",

    # Sports
    "Haikyuu!!",
    "Slam Dunk",
    "Kuroko's Basketball",

    # Mecha
    "Mobile Suit Gundam",
    "Gurren Lagann",
    "86",

    # Isekai
    "Konosuba",
    "Overlord",
    "That Time I Got Reincarnated as a Slime",

    # Movies
    "Spirited Away",
    "Your Name.",
    "Akira",
    "Princess Mononoke",
]

# Find which shows are in our corpus
def find_show(title: str, df: pd.DataFrame) -> pd.Series | None:
    """Find a show by exact or partial title match."""
    # Try exact match on display_title first
    exact = df[df["display_title"] == title]
    if len(exact) == 1:
        return exact.iloc[0]

    # Try exact match on original title
    exact = df[df["title"] == title]
    if len(exact) == 1:
        return exact.iloc[0]

    # Try partial match
    partial = df[df["display_title"].str.contains(title, case=False, na=False)]
    if len(partial) >= 1:
        # Return the one with highest rating count (most popular)
        return partial.sort_values("rating_count", ascending=False).iloc[0]

    partial = df[df["title"].str.contains(title, case=False, na=False)]
    if len(partial) >= 1:
        return partial.sort_values("rating_count", ascending=False).iloc[0]

    return None

# Resolve every requested title once, then split into hits and misses
lookups = [(wanted, find_show(wanted, df)) for wanted in LABELED_SHOWS]
labeled_rows = [match for _, match in lookups if match is not None]
not_found = [wanted for wanted, match in lookups if match is None]

# One row per resolved title, in the same order as LABELED_SHOWS
labeled_df = pd.DataFrame(labeled_rows)
print(f"Found {len(labeled_df)} / {len(LABELED_SHOWS)} shows to label")
if len(not_found) > 0:
    print(f"Not found: {not_found}")

Found 43 / 43 shows to label


## Visualization

In [8]:
def create_2d_scatter(
    df: pd.DataFrame,
    labeled_df: pd.DataFrame,
    method_name: str,
    color_by: str | None = "average_rating",
    point_size: int = 3,
    label_font_size: int = 10,
) -> go.Figure:
    """Build an interactive 2D scatter of the projection with highlighted labels.

    Args:
        df: All points; reads ``x``, ``y``, ``display_title``, ``average_rating``
            and ``aired_from_year`` (plus ``color_by`` when it is a column).
        labeled_df: Points to highlight; reads ``x``, ``y`` and ``display_title``.
            Skipped entirely when empty.
        method_name: Text appended to the figure title.
        color_by: Column used to color the points; ignored if falsy or not in ``df``.
        point_size: Marker size of the main trace.
        label_font_size: Font size of the text next to highlighted points.

    Returns:
        The assembled Plotly figure (not yet shown).
    """

    def _hover(record: pd.Series) -> str:
        # One HTML hover string per row: title, rating (2 decimals) and year.
        return f"{record['display_title']}<br>Rating: {record['average_rating']:.2f}<br>Year: {record['aired_from_year']}"

    hover_text = df.apply(_hover, axis=1)

    # Marker styling; the color scale is attached only for a usable color column.
    marker_style = dict(size=point_size, opacity=0.6)
    if color_by and color_by in df.columns:
        marker_style.update(
            color=df[color_by],
            colorscale="Viridis",
            colorbar=dict(title=color_by),
        )

    fig = go.Figure()

    # Background trace with every anime
    fig.add_trace(
        go.Scatter(
            x=df["x"],
            y=df["y"],
            mode="markers",
            marker=marker_style,
            text=hover_text,
            hoverinfo="text",
            name="Anime",
        )
    )

    # Foreground trace: red diamonds with the title drawn above each point
    if len(labeled_df) > 0:
        label_titles = labeled_df["display_title"]
        fig.add_trace(
            go.Scatter(
                x=labeled_df["x"],
                y=labeled_df["y"],
                mode="markers+text",
                marker=dict(size=8, color="red", symbol="diamond"),
                text=label_titles,
                textposition="top center",
                textfont=dict(size=label_font_size, color="black"),
                hoverinfo="text",
                hovertext=label_titles,
                name="Labeled",
            )
        )

    fig.update_layout(
        title=f"Anime Embedding - {method_name}",
        xaxis_title="Dimension 1",
        yaxis_title="Dimension 2",
        width=1200,
        height=900,
        showlegend=True,
    )

    return fig


def create_3d_scatter(
    df: pd.DataFrame,
    labeled_df: pd.DataFrame,
    method_name: str,
    color_by: str | None = "average_rating",
    point_size: int = 3,
    label_font_size: int = 10,
) -> go.Figure:
    """Create an interactive 3D scatter plot with labels.

    Mirrors ``create_2d_scatter``: same parameters (including the configurable
    label font size) and the same "Dimension N" axis titles.

    Args:
        df: All points; reads ``x``, ``y``, ``z``, ``display_title``,
            ``average_rating`` and ``aired_from_year`` (plus ``color_by`` when
            it is a column).
        labeled_df: Points to highlight; reads ``x``, ``y``, ``z`` and
            ``display_title``. Skipped entirely when empty.
        method_name: Text appended to the figure title.
        color_by: Column used to color the points; ignored if falsy or not in ``df``.
        point_size: Marker size of the main trace.
        label_font_size: Font size of the text next to highlighted points
            (defaults to the previously hard-coded value of 10).

    Returns:
        The assembled Plotly figure (not yet shown).
    """

    # One HTML hover string per row: title, rating (2 decimals) and year.
    hover_text = df.apply(
        lambda r: f"{r['display_title']}<br>Rating: {r['average_rating']:.2f}<br>Year: {r['aired_from_year']}",
        axis=1
    )

    fig = go.Figure()

    scatter_kwargs = dict(
        x=df["x"],
        y=df["y"],
        z=df["z"],
        mode="markers",
        marker=dict(
            size=point_size,
            opacity=0.6,
        ),
        text=hover_text,
        hoverinfo="text",
        name="Anime",
    )

    # Attach the color scale only when a usable color column was requested.
    if color_by and color_by in df.columns:
        scatter_kwargs["marker"]["color"] = df[color_by]
        scatter_kwargs["marker"]["colorscale"] = "Viridis"
        scatter_kwargs["marker"]["colorbar"] = dict(title=color_by)

    fig.add_trace(go.Scatter3d(**scatter_kwargs))

    # Add labeled points
    if len(labeled_df) > 0:
        fig.add_trace(go.Scatter3d(
            x=labeled_df["x"],
            y=labeled_df["y"],
            z=labeled_df["z"],
            mode="markers+text",
            marker=dict(size=6, color="red", symbol="diamond"),
            text=labeled_df["display_title"],
            textposition="top center",
            textfont=dict(size=label_font_size),
            hoverinfo="text",
            hovertext=labeled_df["display_title"],
            name="Labeled",
        ))

    fig.update_layout(
        title=f"Anime Embedding - {method_name}",
        # 3D axes live under `scene`; name them like the 2D plot's axes.
        scene=dict(
            xaxis_title="Dimension 1",
            yaxis_title="Dimension 2",
            zaxis_title="Dimension 3",
        ),
        width=1200,
        height=900,
        showlegend=True,
    )

    return fig

In [9]:
# Pick the plotting helper that matches the projection dimensionality, then render
scatter_fn = create_2d_scatter if N_COMPONENTS == 2 else create_3d_scatter
fig = scatter_fn(df, labeled_df, projection_method.name, color_by="average_rating")

fig.show()

## Alternative Coloring

Try different color schemes to explore patterns.

In [10]:
# Same scatter, colored by first-aired year to reveal temporal clustering
scatter_fn = create_2d_scatter if N_COMPONENTS == 2 else create_3d_scatter
fig_year = scatter_fn(df, labeled_df, projection_method.name, color_by="aired_from_year")

year_title = f"Anime Embedding by Year - {projection_method.name}"
fig_year.update_layout(title=year_title)
fig_year.show()

In [11]:
# Popularity coloring: log10(rating_count + 1), with missing counts filled with 1
# before the +1 shift. The log scale spreads the colors more evenly.
df["log_rating_count"] = df["rating_count"].fillna(1).add(1).pipe(np.log10)

scatter_fn = create_2d_scatter if N_COMPONENTS == 2 else create_3d_scatter
fig_pop = scatter_fn(df, labeled_df, projection_method.name, color_by="log_rating_count")

popularity_title = f"Anime Embedding by Popularity (log scale) - {projection_method.name}"
fig_pop.update_layout(title=popularity_title)
fig_pop.show()

## Export Projection

Save the projection for use in the web app or further analysis.

In [12]:
# Export to JSON for web use: one {id, title, x, y[, z]} record per anime
axis_columns = ["x", "y", "z"] if N_COMPONENTS == 3 else ["x", "y"]
export_data = [
    {
        "id": int(record["id"]),
        "title": record["display_title"],
        **{axis: float(record[axis]) for axis in axis_columns},
    }
    for record in df[["id", "display_title", *axis_columns]].to_dict("records")
]

# Derive a descriptive filename from the method name:
# lowercase, spaces -> underscores, and "(", ")", ",", "=" removed.
slug_table = str.maketrans({" ": "_", "(": None, ")": None, ",": None, "=": None})
method_slug = projection_method.name.lower().translate(slug_table)
output_filename = f"projected_embedding_{method_slug}.json"
output_path = DATA_DIR / output_filename

with open(output_path, "w") as out_file:
    json.dump(export_data, out_file)

print(f"Saved projection to {output_path}")

Saved projection to work/data/projected_embedding_umap_n20_d0.01.json


## Quick Comparison: Different Methods

Uncomment and run to compare projections side-by-side.

In [13]:
# # Uncomment to run comparison of different methods
# from plotly.subplots import make_subplots

# methods = [
#     UMAPProjection(n_components=2, n_neighbors=15, min_dist=0.1),
#     UMAPProjection(n_components=2, n_neighbors=30, min_dist=0.05),
#     UMAPProjection(n_components=2, n_neighbors=50, min_dist=0.01),
#     # TSNEProjection(n_components=2, perplexity=30),
#     # TSNEProjection(n_components=2, perplexity=50),
#     # PyMDEProjection(n_components=2, n_neighbors=15),
# ]

# fig = make_subplots(
#     rows=1, cols=len(methods),
#     subplot_titles=[m.name for m in methods],
# )

# for i, method in enumerate(methods):
#     print(f"Running {method.name}...")
#     proj = method.fit_transform(embeddings)

#     fig.add_trace(
#         go.Scatter(
#             x=proj[:, 0],
#             y=proj[:, 1],
#             mode="markers",
#             marker=dict(size=2, opacity=0.5),
#             showlegend=False,
#         ),
#         row=1, col=i+1
#     )

# fig.update_layout(
#     title="Projection Method Comparison",
#     width=400 * len(methods),
#     height=400,
# )
# fig.show()

## Export for Web App

Export the 2D embedding in the format expected by the web visualization.

In [14]:
import math
from sklearn.neighbors import NearestNeighbors

# Configuration for export
N_NEIGHBORS_EXPORT = 20
METRIC_EXPORT = "cosine"
# NOTE(review): METRIC_EXPORT is written into the file as the metric (also inside the
# "neighbors" section), but the neighbor search below uses Euclidean distance on the
# projected coordinates. Confirm which meaning the web app expects.

# Build points dict with x, y, color (year), and r (radius based on popularity).
# Keys are the row *position* as a string, so they agree with the keys of the
# neighbors dict and with the order of the `ids` list built below.
points = {}
for position, (_, row) in enumerate(df.iterrows()):
    # A missing year, or a year of 1900 (presumably a placeholder for "unknown"),
    # is exported as 2026.
    year = row["aired_from_year"]
    if pd.isna(year) or year == 1900:
        year = 2026
    else:
        year = int(year)

    # Radius based on rating count: sqrt(rating_count) / 110 + 1 (missing count -> 0)
    rating_count = row["rating_count"] if pd.notna(row["rating_count"]) else 0
    radius = math.sqrt(rating_count) / 110.0 + 1.0

    points[str(position)] = {
        "x": round(float(row["x"]), 4),
        "y": round(float(row["y"]), 4),
        "color": year,
        "r": round(radius, 4),
    }

# Compute nearest neighbors in the projected embedding space
projected_2d = projected[:, :2]  # Ensure we're using 2D
nn = NearestNeighbors(n_neighbors=N_NEIGHBORS_EXPORT + 1, metric="euclidean")  # +1 because each point is returned as its own neighbor
nn.fit(projected_2d)
distances, indices = nn.kneighbors(projected_2d)

# Build neighbors dict without the point itself.
# The point is usually the first result, but when several points share identical
# coordinates it can appear at another rank, so it is filtered out by value
# rather than by dropping column 0.
neighbors_dict = {}
for position, candidate_indices in enumerate(indices.tolist()):
    others = [neighbor for neighbor in candidate_indices if neighbor != position]
    neighbors_dict[str(position)] = others[:N_NEIGHBORS_EXPORT]

# Build the ids list (anime IDs in corpus order)
ids_list = df["id"].astype(int).tolist()

# Assemble the final export structure
export_structure = {
    "points": points,
    "metric": METRIC_EXPORT,
    "n_neighbors": N_NEIGHBORS_EXPORT,
    "ids": ids_list,
    "neighbors": {
        "metric": METRIC_EXPORT,
        "n_neighbors": N_NEIGHBORS_EXPORT,
        "_format": "expanded",
        "neighbors": neighbors_dict,
    }
}

# Save to file
output_path = DATA_DIR / "projected_model_embedding.json"
with open(output_path, "w") as f:
    json.dump(export_structure, f)

print(f"Exported web embedding to {output_path}")
print(f"  - {len(points)} points")
print(f"  - {N_NEIGHBORS_EXPORT} neighbors per point")
print(f"  - Metric: {METRIC_EXPORT}")

Exported web embedding to work/data/projected_model_embedding.json
  - 6000 points
  - 20 neighbors per point
  - Metric: cosine
