# Phoenix Embeddings

This short tutorial walks through creating Phoenix `Dataset` objects, projecting their embeddings into a UMAP point cloud, and displaying that point cloud with the `UMAPWidget`.

In [None]:
from phoenix.datasets import Dataset, EmbeddingColumnNames, Schema
from phoenix.pointcloud import DriftPointCloud, UMAPProjector
from phoenix.widgets import UMAPWidget

In [None]:
# Base name of the fixture files on local disk; the format-specific suffix
# (".hdf5", "_training.csv", ...) is appended where the datasets are loaded.
test_local_filename = "NLP_sentiment_classification_language_drift"

# Base URL of the same fixture hosted remotely; suffixes are appended the same way.
test_url_filename = "https://storage.googleapis.com/arize-assets/fixtures/open-source/datasets/unstructured/nlp/sentiment_classification_language_drift"

# Names of the non-embedding feature columns handed to the schema.
features = ["reviewer_age", "reviewer_gender", "product_category", "language"]

# Each key is the name under which an embedding feature is referred to later
# (the projection step looks it up as "embedding_feature"); the value names
# the columns holding the embedding vector and the raw text it came from.
embedding_features = dict(
    embedding_feature=EmbeddingColumnNames(
        raw_data_column_name="text",
        vector_column_name="text_vector",
    ),
)

# Define a Schema() object telling Phoenix which dataset column plays which role
# (timestamp, predicted label, actual label, features, embedding features).
schema = Schema(
    feature_column_names=features,
    embedding_feature_column_names=embedding_features,
    prediction_label_column_name="pred_label",
    actual_label_column_name="label",
    timestamp_column_name="prediction_ts",
)

# Selects where the training/production fixtures are loaded from.
# Supported values: "hdf5" and "csv" read from ./fixtures on local disk,
# "url_hdf5" and "url_csv" read from the remote fixture URL.
desired_format = "url_hdf5"

if desired_format == "hdf5":
    # A single local HDF5 file holds both datasets under separate keys.
    train_ds = Dataset.from_hdf(
        f"./fixtures/{test_local_filename}.hdf5", schema=schema, key="training"
    )
    prod_ds = Dataset.from_hdf(
        f"./fixtures/{test_local_filename}.hdf5", schema=schema, key="production"
    )
elif desired_format == "url_hdf5":
    # Same layout as above, fetched by URL; note the keyword is `hdf_key` here, not `key`.
    train_ds = Dataset.from_url(f"{test_url_filename}.hdf5", schema=schema, hdf_key="training")
    prod_ds = Dataset.from_url(f"{test_url_filename}.hdf5", schema=schema, hdf_key="production")
elif desired_format == "csv":
    # CSV fixtures are split into one file per dataset.
    train_ds = Dataset.from_csv(f"./fixtures/{test_local_filename}_training.csv", schema=schema)
    prod_ds = Dataset.from_csv(f"./fixtures/{test_local_filename}_production.csv", schema=schema)
elif desired_format == "url_csv":
    # NOTE(review): remote CSVs go through `from_csv` rather than `from_url`;
    # this presumably relies on `from_csv` accepting a URL — confirm.
    train_ds = Dataset.from_csv(f"{test_url_filename}_training.csv", schema=schema)
    prod_ds = Dataset.from_csv(f"{test_url_filename}_production.csv", schema=schema)
else:
    # Fail here with a clear message instead of leaving train_ds/prod_ds
    # undefined and hitting a NameError in a later cell.
    raise ValueError(
        f"Unsupported desired_format {desired_format!r}; "
        "expected one of 'hdf5', 'url_hdf5', 'csv', 'url_csv'"
    )

## Obtain the point cloud


In [None]:
# Hyperparameters forwarded to the UMAP projector.
UMAP_hyperparameters = dict(n_components=3, min_dist=0)
projector = UMAPProjector(hyperparameters=UMAP_hyperparameters)

# Project the "embedding_feature" embeddings of both datasets. Production is
# passed first and training second; the results are unpacked as the primary
# points, the reference points, and the clusters.
primary_pts, reference_pts, clusters = projector.project(
    prod_ds,
    train_ds,
    "embedding_feature",
)

# Bundle the projection results into a single point-cloud object.
pc = DriftPointCloud(primary_pts, reference_pts, clusters)

In [None]:
# Hand the JSON form of the point cloud to the widget, then display it.
widget = UMAPWidget(
    pc.to_json(),
)
widget.show()