# Embedding space visualization
This notebook uses the t-SNE and UMAP algorithms to project the embeddings to 2D and to check whether clusters form according to features of the painting such as type, style, or century.

### 0. Import libraries and data

In [None]:
import numpy as np
import polars as pl
import plotly.express as px
from umap import UMAP
from sklearn.manifold import TSNE

# Directory prefix (relative to the notebook's working directory) that is
# prepended to the names of the JSON embedding files read by this notebook.
INPUT_PATH_EMBEDDINGS = "../../data/embeddings/"

In [None]:
# Both embedding tables get the same derived "century" column, so the
# expression is built once and reused.
# NOTE(review): with this formula a year that is an exact multiple of 100
# (e.g. 1900) is assigned to the following century (20) - confirm that this
# bucketing is the intended one.
century_expr = (pl.col("year") // 100 + 1).alias("century")

# Embeddings from the CLIP model file.
clip_embeddings = pl.read_json(
    f"{INPUT_PATH_EMBEDDINGS}clip_embeddings_test_clip_full_1e_6_diff_lr_not_frozen_features.json",
    infer_schema_length=2000,
)
clip_embeddings = clip_embeddings.with_columns(century_expr)

# Embeddings from the baseline (projection) model file.
projected_embeddings = pl.read_json(
    f"{INPUT_PATH_EMBEDDINGS}baseline_embeddings_test_projections_text_embedding_enhanced_features.json",
    infer_schema_length=2000,
)
projected_embeddings = projected_embeddings.with_columns(century_expr)

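- Two types of embeddings are compared: the CLIP embeddings and the projected (baseline) embeddings.
- Features of interest: type, style, century, object.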

In [None]:
def plot_embeddings(
    complete_input_embeddings,
    modality,
    feature_name,
    viz_algorithm="t-SNE",
    viz_algorithm_param=30,
    top_feature_values=10,
):
    """Project embeddings to 2D and show a scatter plot coloured by a feature.

    Only the rows whose ``feature_name`` value is one of the
    ``top_feature_values`` most frequent non-null values are kept. Their
    embeddings are reduced to two dimensions with t-SNE or UMAP (cosine
    metric, fixed random seed) and displayed with Plotly; each point is
    coloured and given a marker symbol according to its feature value.

    Args:
        complete_input_embeddings: polars DataFrame holding ``feature_name``
            and the embedding column of the chosen modality
            (``"embedding_object_image"`` or ``"text_embedding_enhanced"``).
        modality: ``"image"`` or ``"text"``; selects the embedding column.
        feature_name: Column whose values define the colour groups.
        viz_algorithm: ``"t-SNE"`` or ``"UMAP"``.
        viz_algorithm_param: Perplexity for t-SNE, ``n_neighbors`` for UMAP.
        top_feature_values: Number of most frequent feature values to keep.

    Raises:
        ValueError: If ``modality`` or ``viz_algorithm`` is not supported.
    """
    embedding_columns = {
        "image": "embedding_object_image",
        "text": "text_embedding_enhanced",
    }
    supported_algorithms = ("t-SNE", "UMAP")

    # Validate before any (potentially slow) work; otherwise an unknown value
    # would only surface later as an UnboundLocalError.
    if modality not in embedding_columns:
        raise ValueError(
            f"Unsupported modality {modality!r}; expected one of {sorted(embedding_columns)}"
        )
    if viz_algorithm not in supported_algorithms:
        raise ValueError(
            f"Unsupported viz_algorithm {viz_algorithm!r}; "
            f"expected one of {list(supported_algorithms)}"
        )

    # Most frequent non-null values of the feature, in descending count order.
    feature_values = (
        complete_input_embeddings.filter(pl.col(feature_name).is_not_null())[feature_name]
        .value_counts()
        .sort("count", descending=True)[:top_feature_values][feature_name]
        .to_list()
    )
    input_embeddings = complete_input_embeddings.filter(pl.col(feature_name).is_in(feature_values))

    print(
        f"Original size {complete_input_embeddings.shape[0]} - filtered size {input_embeddings.shape[0]}"
    )

    # One embedding vector per row, stacked into a single 2D array.
    embeddings = np.stack(input_embeddings[embedding_columns[modality]].to_numpy())

    # String labels for the legend; any missing value is shown as "none".
    labels = input_embeddings[feature_name].cast(pl.String).fill_null("none")

    # random_state is fixed so repeated runs give the same layout.
    if viz_algorithm == "t-SNE":
        reducer = TSNE(
            n_components=2, perplexity=viz_algorithm_param, random_state=42, metric="cosine"
        )
    else:
        reducer = UMAP(
            n_neighbors=viz_algorithm_param,
            min_dist=0.9,
            n_components=2,
            random_state=42,
            n_jobs=1,
            metric="cosine",
        )
    embeddings_2d = reducer.fit_transform(embeddings)

    # pl.DataFrame names the two projected dimensions "column_0"/"column_1".
    plot_frame = pl.DataFrame(embeddings_2d).with_columns(labels.alias("type"))

    fig = px.scatter(
        plot_frame,
        x="column_0",
        y="column_1",
        color="type",
        symbol="type",
        # Show the plotted feature's name in the legend instead of "type".
        labels={"type": feature_name},
        title=f"{viz_algorithm} Visualization of Embeddings by Painting {feature_name.capitalize()}",
    )

    fig.show()

### LABEL
**projected**
- text
    - t-SNE 10; UMAP 5
- image
    - t-SNE 5, UMAP 5

**clip**
- text
    - t-SNE 10 / 50; UMAP ?
- image
    - t-SNE 10; UMAP 5

### TYPE
**projected**
- text
    - t-SNE 30; UMAP 30
- image
    - t-SNE 30; UMAP 30 - in this case, UMAP spreads the points very uniformly

**clip**
- text
    - t-SNE 10 / 30; UMAP 30 - no clusters can be detected (UMAP tends to leave some outlier points)
- image
    - t-SNE 30; UMAP 30 - the data tend to group into clusters

In [None]:
# Draw one t-SNE plot of the projected image embeddings, coloured by
# "coarse_type", for each perplexity value.
tsne_perplexities = (5, 10, 30, 50)
for perplexity in tsne_perplexities:
    plot_embeddings(
        projected_embeddings.clone(),
        modality="image",
        feature_name="coarse_type",
        viz_algorithm="t-SNE",
        viz_algorithm_param=perplexity,
    )

In [None]:
# Draw one UMAP plot of the projected image embeddings, coloured by
# "coarse_type", for each n_neighbors value.
umap_neighbor_counts = (5, 10, 30, 50)
for n_neighbors in umap_neighbor_counts:
    plot_embeddings(
        projected_embeddings.clone(),
        modality="image",
        feature_name="coarse_type",
        viz_algorithm="UMAP",
        viz_algorithm_param=n_neighbors,
    )