# Embedding space visualization
This notebook tests t-SNE and UMAP algorithms to plot the embeddings and understand if clusters are formed depending on features such as type, style or century of the painting.

### 0. Import libraries and data

In [None]:
import numpy as np
import polars as pl
import plotly.express as px
from umap import UMAP
from sklearn.manifold import TSNE

# Directory holding the embedding JSON files. It is prepended verbatim to the
# file names when loading, so the trailing slash is required.
INPUT_PATH_EMBEDDINGS = "../../data/embeddings/"

In [None]:
def _keep_most_probable_per_description(frame):
    """Keep one row per ``object_description``: the one with the highest ``probability``.

    The surviving rows are put back in their original ``index`` order and the
    ``index`` column is then regenerated so that it is contiguous again.
    """
    return (
        frame.sort("probability", descending=True)
        .unique(subset=["object_description"], keep="first")
        .sort("index")
        .drop("index")
        .with_row_index()
    )


# Embeddings read from the "baseline ... projections" file, ordered by painting id.
# NOTE(review): `year // 100 + 1` maps exact century years (e.g. 1900) to the
# following century (20) -- confirm this is the intended convention.
projected_embeddings = (
    pl.read_json(
        f"{INPUT_PATH_EMBEDDINGS}baseline_embeddings_test_projections_text_embedding_enhanced_features.json",
        infer_schema_length=1000,
    )
    .sort("painting_id")
    .with_row_index()
    .with_columns(("test/" + pl.col("index").cast(pl.String) + ".png").alias("image_name"))
    .with_columns((pl.col("year") // 100 + 1).alias("century"))
)

# Captured before de-duplication so that it still has one value per loaded row;
# it is reused below for the CLIP frame.
probabilities = projected_embeddings["probability"]
projected_embeddings = _keep_most_probable_per_description(projected_embeddings)


# Embeddings read from the "clip ... not_frozen" file. This file is expected to
# already contain an "index" column (no row index is generated here).
clip_embeddings = (
    pl.read_json(
        f"{INPUT_PATH_EMBEDDINGS}clip_embeddings_test_clip_full_1e_6_diff_lr_not_frozen_features.json",
        infer_schema_length=1000,
    )
    .with_columns(("test/" + pl.col("index").cast(pl.String) + ".png").alias("image_name"))
    .with_columns((pl.col("year") // 100 + 1).alias("century"))
    # NOTE(review): probabilities are attached by position, and they come from
    # the frame above *after* it was sorted by painting_id. This assumes the
    # CLIP file has the same number of rows in that same order -- confirm.
    .with_columns(pl.Series(probabilities).alias("probability"))
)
clip_embeddings = _keep_most_probable_per_description(clip_embeddings)

In [None]:
# clip_embeddings = pl.read_json(
#     f"{INPUT_PATH_EMBEDDINGS}clip_embeddings_test_clip_full_1e_6_diff_lr_not_frozen_features.json",
#     infer_schema_length=2000,
# ).with_columns((pl.col("year") // 100 + 1).alias("century"))

# projected_embeddings = pl.read_json(
#     f"{INPUT_PATH_EMBEDDINGS}baseline_embeddings_test_projections_text_embedding_enhanced_features.json",
#     infer_schema_length=2000,
# ).with_columns((pl.col("year") // 100 + 1).alias("century")).with_row_index()

In [None]:
def plot_embeddings(
    complete_input_embeddings,
    modality,
    feature_name,
    viz_algorithm="t-SNE",
    viz_algorithm_param=30,
    top_feature_values=10,
):
    """Project embeddings to 2D and show a scatter plot coloured by a feature.

    Only rows whose ``feature_name`` value is among the ``top_feature_values``
    most frequent non-null values are kept; the original and filtered row
    counts are printed. The selected embedding column is then reduced to two
    dimensions and displayed with plotly (``fig.show()``).

    Args:
        complete_input_embeddings: polars DataFrame containing ``feature_name``,
            ``"object_description"``, ``"label"`` and the embedding column of
            the chosen modality.
        modality: ``"visual"`` (column ``"embedding_object_image"``) or
            ``"textual"`` (column ``"text_embedding_enhanced"``).
        feature_name: column used for filtering, point colour and symbol.
        viz_algorithm: ``"t-SNE"`` or ``"UMAP"``.
        viz_algorithm_param: perplexity for t-SNE, ``n_neighbors`` for UMAP.
        top_feature_values: number of most frequent feature values to keep.

    Returns:
        None. The figure is displayed as a side effect.

    Raises:
        ValueError: if ``modality`` or ``viz_algorithm`` is not supported.
    """
    embedding_columns = {
        "visual": "embedding_object_image",
        "textual": "text_embedding_enhanced",
    }
    supported_algorithms = ("t-SNE", "UMAP")

    # Fail fast with a clear message: previously an unsupported value fell
    # through the if/elif chain and surfaced later as an UnboundLocalError.
    if modality not in embedding_columns:
        raise ValueError(
            f"Unknown modality {modality!r}; expected one of {sorted(embedding_columns)}"
        )
    if viz_algorithm not in supported_algorithms:
        raise ValueError(
            f"Unknown viz_algorithm {viz_algorithm!r}; expected one of {list(supported_algorithms)}"
        )

    # Most frequent non-null values of the feature, most common first.
    feature_values = (
        complete_input_embeddings.filter(pl.col(feature_name).is_not_null())[feature_name]
        .value_counts()
        .sort("count", descending=True)[:top_feature_values][feature_name]
        .to_list()
    )
    input_embeddings = complete_input_embeddings.filter(pl.col(feature_name).is_in(feature_values))

    print(
        f"Original size {complete_input_embeddings.shape[0]} - filtered size {input_embeddings.shape[0]}"
    )

    # One row per sample, stacked into a single 2D array.
    embeddings = np.stack(input_embeddings[embedding_columns[modality]].to_numpy())

    if viz_algorithm == "t-SNE":
        reducer = TSNE(
            n_components=2, perplexity=viz_algorithm_param, random_state=42, metric="cosine", method="exact"
        )
    else:  # "UMAP" -- guaranteed by the validation above
        reducer = UMAP(
            n_neighbors=viz_algorithm_param,
            min_dist=0.9,
            n_components=2,
            random_state=42,
            n_jobs=1,
            metric="cosine",
        )
    embeddings_2d = reducer.fit_transform(embeddings)

    # feature_name may itself be "label" or "object_description"; de-duplicate so
    # the same column is not attached (and listed in the hover box) twice.
    metadata_columns = list(dict.fromkeys([feature_name, "object_description", "label"]))
    embeddings_2d = pl.DataFrame(embeddings_2d).with_columns(
        [input_embeddings[name].cast(pl.String).alias(name) for name in metadata_columns]
    )

    fig = px.scatter(
        # Sorted so that points of the same feature value are contiguous.
        embeddings_2d.sort(feature_name),
        x="column_0",
        y="column_1",
        color=feature_name,
        symbol=feature_name,
        hover_data=metadata_columns,
        title=f"{viz_algorithm} Visualization of Embeddings by Painting {feature_name.capitalize()}",
    )

    fig.show()

- 2 types of embeddings
- 2 modalities
- type, style, century, object
---
- label: perplexity 10
- type: perplexity 10, 30
---
- Text modality
    - coarse type - no well-defined clusters, but CLIP tends to have denser regions with points of the same painting type (906 points)
    - style - the same, but the denser regions with the same attribute are not as clearly visible as for type (411 points)
    - century - no clusters are distinguishable, the same as before (1163 points)
    - label - ?

- Visual modality
    - coarse type - no well-defined clusters, but CLIP tends to have denser regions with points of the same painting type
    - style - as in the case of style for textual modality
    - century - no clusters are distinguishable, the same as before
    - label - ?

In the case of visual modality, it's OK to have multiple images corresponding to the same description.
In the case of textual modality, there will be clusters of points with the same description.

In [None]:
# t-SNE (perplexity 10) of the visual embeddings, coloured by label:
# first the CLIP embeddings, then the projected ones.
for embeddings_frame in (clip_embeddings, projected_embeddings):
    plot_embeddings(embeddings_frame.clone(), "visual", "label", "t-SNE", 10)