# Embedding space visualization
This notebook uses t-SNE and UMAP to project the embeddings into two dimensions and plot them, in order to check whether clusters form according to features such as the type, style, or century of the painting.

### 0. Import libraries and data

In [None]:
import re
import copy
import json
import numpy as np
import polars as pl
import plotly.express as px
from umap import UMAP
from sklearn.manifold import TSNE

# Directory prefix for the embedding JSON files read below. The path is
# relative, so it is resolved against the current working directory, and the
# trailing slash is required because file names are appended directly to it.
INPUT_PATH_EMBEDDINGS = "../../data/embeddings/"

In [None]:
# Load the two embedding tables from JSON. `infer_schema_length=2000` sets how
# many rows polars inspects when inferring the column schema.
clip_embeddings = pl.read_json(
    f"{INPUT_PATH_EMBEDDINGS}clip_embeddings_test_clip_full_1e_6_diff_lr_not_frozen_features.json",
    infer_schema_length=2000,
)
projected_embeddings = pl.read_json(
    f"{INPUT_PATH_EMBEDDINGS}baseline_embeddings_test_projections_text_embedding_enhanced_features.json",
    infer_schema_length=2000,
)

In [None]:
# Stack the per-row "text_embedding_enhanced" vectors into one array with a
# row per table entry; this is the input to the projections below.
clip_description_embeddings = np.stack(
    clip_embeddings["text_embedding_enhanced"].to_numpy()
)
# Bare expression so the notebook displays the stacked array.
clip_description_embeddings

In [None]:
# One label per row, taken from the "label" column. Missing labels are
# replaced with the literal string "none" so every point has a category.
clip_painting_types = [
    "none" if label is None else label
    for label in clip_embeddings["label"].to_numpy()
]
# Bare expression so the notebook displays the resulting list.
clip_painting_types

In [None]:
# Project the embeddings to 2-D with t-SNE. `random_state` is fixed so that
# re-running the cell produces the same layout.
tsne = TSNE(
    n_components=2,
    perplexity=30,
    random_state=42,
)
clip_description_embeddings_2d = tsne.fit_transform(
    clip_description_embeddings
)

In [None]:
# Wrap the 2-D t-SNE coordinates in a DataFrame and attach the label of each
# point as a string column named "type". The coordinate columns keep the names
# polars assigns automatically ("column_0", "column_1"), which the plotting
# cell refers to.
clip_description_embeddings_2d_type = (
    pl.DataFrame(clip_description_embeddings_2d)
    .with_columns(
        pl.Series("type", clip_painting_types).cast(pl.String)
    )
)
# Bare expression so the notebook displays the DataFrame.
clip_description_embeddings_2d_type

In [None]:
# Scatter plot of the 2-D t-SNE coordinates. Each point is coloured and given
# a marker symbol according to its "type" column, so the title names the
# painting type (the feature actually plotted) rather than the style.
fig_tsne = px.scatter(
    clip_description_embeddings_2d_type,
    x='column_0',
    y='column_1',
    color='type',
    symbol='type',
    title='t-SNE Visualization of Embeddings by Painting Type'
)

fig_tsne.show()

In [None]:
# Project the same embeddings to 2-D with UMAP. `random_state` is fixed so
# that re-running the cell produces the same layout.
umap = UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42,
)
# NOTE(review): this rebinds `clip_description_embeddings_2d_type`, which held
# the t-SNE DataFrame with the "type" column, to the raw array returned by
# `fit_transform`. Confirm this is intended; a separate name for the UMAP
# result would keep the t-SNE table available.
clip_description_embeddings_2d_type = umap.fit_transform(
    clip_description_embeddings
)


In [None]:
!pip install umap-learn