# Phoenix Embeddings

This short tutorial walks through creating Phoenix `Dataset` objects, projecting their embeddings into a UMAP point cloud with `UMAPProjector`, and visualizing the result with the `UMAPWidget`.

In [None]:
from phoenix.datasets import Dataset, EmbeddingColumnNames, Schema
from phoenix.pointcloud import DriftPointCloud, UMAPProjector
from phoenix.widgets import UMAPWidget

In [None]:
# Base name (without extension) of the fixture file under ./fixtures/
test_filename = "NLP_sentiment_classification_language_drift"

# The dictionary key is the embedding feature's name: it is the identifier
# used later in the notebook to select this embedding.
embedding_features = dict(
    embedding_feature=EmbeddingColumnNames(
        vector_column_name="text_vector",
        data_column_name="text",
    ),
)

# Columns declared as plain (non-embedding) features
features = ["reviewer_age", "reviewer_gender", "product_category", "language"]

# The Schema maps each role (id, timestamp, labels, features, embeddings) to
# the name of the column that holds it.
schema = Schema(
    feature_column_names=features,
    embedding_feature_column_names=embedding_features,
    prediction_id_column_name="prediction_id",
    timestamp_column_name="prediction_ts",
    prediction_label_column_name="pred_label",
    actual_label_column_name="label",
)

# Both datasets are read from the same HDF5 fixture, under different keys.
train_ds = Dataset.from_hdf(
    f"./fixtures/{test_filename}.hdf5",
    key="training",
    schema=schema,
)
prod_ds = Dataset.from_hdf(
    f"./fixtures/{test_filename}.hdf5",
    key="production",
    schema=schema,
)

In [None]:
# Build the drift point cloud from the two datasets' embeddings.
# Hyperparameters handed to the projector
# (NOTE(review): presumably forwarded to UMAP itself — confirm).
UMAP_hyperparameters = dict(n_components=3, min_dist=0)
projector = UMAPProjector(hyperparameters=UMAP_hyperparameters)

# prod_ds is passed first and train_ds second; the result is unpacked into
# primary points, reference points and clusters, in that order.
primary_pts, reference_pts, clusters = projector.project(
    prod_ds,
    train_ds,
    "embedding_feature",
)
pc = DriftPointCloud(primary_pts, reference_pts, clusters)

In [None]:
# Serialize the point cloud with to_json() and hand the result to the widget.
widget = UMAPWidget(pc.to_json())
# NOTE(review): presumably renders the widget in the notebook output — confirm.
widget.show()