# Phoenix Embeddings

This short tutorial walks through creating Phoenix `Dataset` objects, projecting their embeddings into a UMAP point cloud with `UMAPProjector`, and visualizing the result with the `UMAPWidget`.

In [1]:
from phoenix.datasets import Dataset, EmbeddingColumnNames, Schema
from phoenix.pointcloud import DriftPointCloud, UMAPProjector
from phoenix.widgets import UMAPWidget

In [2]:
# Fixture locations, given without a file extension. The URL stem gets ".hdf5"
# appended where the datasets are loaded; the local stem is not referenced in
# the visible cells.
test_local_filename = "NLP_sentiment_classification_language_drift"
test_url_filename = "https://storage.googleapis.com/arize-assets/fixtures/open-source/datasets/unstructured/nlp/sentiment_classification_language_drift"

# Columns treated as ordinary (non-embedding) features.
features = ["reviewer_age", "reviewer_gender", "product_category", "language"]

# Embedding features keyed by name. The key ("embedding_feature") is the name
# later passed to the projector; the value says which columns hold the
# embedding vector and the raw text it was computed from.
embedding_features = dict(
    embedding_feature=EmbeddingColumnNames(
        raw_data_column_name="text",
        vector_column_name="text_vector",
    ),
)

# Column-role mapping shared by both schemas below.
_schema_kwargs = dict(
    timestamp_column_name="prediction_ts",
    prediction_label_column_name="pred_label",
    actual_label_column_name="label",
    feature_column_names=features,
    embedding_feature_column_names=embedding_features,
)

# Two separate but identically configured Schema instances: `schema` is used
# for the training dataset and `schema2` for the production dataset.
schema = Schema(**_schema_kwargs)
schema2 = Schema(**_schema_kwargs)

In [3]:
# NOTE(review): `desired_format` is not referenced in the visible cells.
desired_format = "url_hdf5"

# Build the training dataset from the remote HDF5 fixture, selecting the
# "training" key and describing its columns with `schema`.
train_ds = Dataset.from_url(
    f"{test_url_filename}.hdf5",
    hdf_key="training",
    schema=schema,
)

Downloading file: sentiment_classification_language_drift.hdf5

Dataset info written to '/Users/xandersong/phoenix/datasets/dataset_af9b4f2e-99ee-4fc6-b826-cf680cafeac6'
Dataset already persisted
Dataset: dataset_af9b4f2e-99ee-4fc6-b826-cf680cafeac6 initialized


In [None]:
# Display the schema used for the training dataset (bare expression -> cell output).
schema

In [4]:
# Build the production dataset from the same remote HDF5 fixture, this time
# selecting the "production" key and using the second schema instance.
prod_ds = Dataset.from_url(
    f"{test_url_filename}.hdf5",
    hdf_key="production",
    schema=schema2,
)

Downloading file: sentiment_classification_language_drift.hdf5

Dataset info written to '/Users/xandersong/phoenix/datasets/dataset_c4661d9c-91a6-48de-9d40-07d491646307'
Dataset already persisted
Dataset: dataset_c4661d9c-91a6-48de-9d40-07d491646307 initialized


## Obtain the point cloud


In [5]:
# Hyperparameters handed to the projector (presumably forwarded to UMAP —
# TODO confirm): three output components and a minimum distance of 0.
UMAP_hyperparameters = dict(n_components=3, min_dist=0)
projector = UMAPProjector(hyperparameters=UMAP_hyperparameters)

# Project the "embedding_feature" embeddings with production as the primary
# dataset and training as the reference dataset.
# NOTE(review): the saved output of this cell records
# "AttributeError: 'dict' object has no attribute 'vector_column_name'".
# The cause is not visible in this notebook — TODO confirm that the schema's
# embedding entries are still EmbeddingColumnNames objects (not plain dicts)
# by the time the datasets reach the projector.
primary_pts, reference_pts, clusters = projector.project(
    prod_ds,
    train_ds,
    "embedding_feature",
)

# Bundle the projected points and clusters into a drift point cloud.
pc = DriftPointCloud(primary_pts, reference_pts, clusters)

AttributeError: 'dict' object has no attribute 'vector_column_name'

In [None]:
# Serialize the point cloud to JSON and hand it to the UMAP widget.
widget = UMAPWidget(pc.to_json())
# Render the widget in the notebook output.
widget.show()