# Phoenix Embeddings

This small tutorial goes over creating Phoenix's `Dataset` objects and using them to obtain a UMAP pointcloud using the `UMAPWidget`

In [1]:
from phoenix.datasets import Dataset, EmbeddingColumnNames, Schema
from phoenix.pointcloud import DriftPointCloud, UMAPProjector
from phoenix.widgets import UMAPWidget

In [2]:
test_local_filename = "NLP_sentiment_classification_language_drift"
test_url_filename = "https://storage.googleapis.com/arize-assets/fixtures/OpenSource/Embeddings/datasets/NLP/NLP_sentiment_classification_language_drift"

features = [
    "reviewer_age",
    "reviewer_gender",
    "product_category",
    "language",
]

embedding_features = {
    "embedding_feature": EmbeddingColumnNames(
        vector_column_name="text_vector",  # Will be name of embedding feature in the app
        raw_data_column_name="text",
    ),
}

# Define a Schema() object for Arize to pick up data from the correct columns for logging
schema = Schema(
    timestamp_column_name="prediction_ts",
    prediction_label_column_name="pred_label",
    actual_label_column_name="label",
    feature_column_names=features,
    embedding_feature_column_names=embedding_features,
)

desired_format = "url_hdf5"

if desired_format == "hdf5":
    train_ds = Dataset.from_hdf(
        f"./fixtures/{test_local_filename}.hdf5", schema=schema, keys="training"
    )
    prod_ds = Dataset.from_hdf(
        f"./fixtures/{test_local_filename}.hdf5", schema=schema, keys="production"
    )
elif desired_format == "url_hdf5":
    train_ds = Dataset.from_hdf(f"{test_url_filename}.hdf5", schema=schema, keys="training")
    prod_ds = Dataset.from_hdf(f"{test_url_filename}.hdf5", schema=schema, keys="production")
elif desired_format == "csv":
    train_ds = Dataset.from_csv(f"./fixtures/{test_local_filename}_training.csv", schema=schema)
    prod_ds = Dataset.from_csv(f"./fixtures/{test_local_filename}_production.csv", schema=schema)
elif desired_format == "url_csv":
    train_ds = Dataset.from_csv(f"{test_url_filename}_training.csv", schema=schema)
    prod_ds = Dataset.from_csv(f"{test_url_filename}_production.csv", schema=schema)

Downloading file: NLP_sentiment_classification_language_drift.hdf5
Done!
Downloading file: NLP_sentiment_classification_language_drift.hdf5
Done!


In [3]:
train_ds.head()

Unnamed: 0,prediction_ts,reviewer_age,reviewer_gender,product_category,language,text,text_vector,label,pred_label
0,1650092000.0,21,female,apparel,english,Poor quality of fabric and ridiculously tight ...,"[-0.070516996, 0.6640034, 0.33579218, -0.26907...",negative,negative
1,1650093000.0,29,male,kitchen,english,"Love these glasses, thought they'd be everyday...","[-0.0024410924, -0.5406275, 0.31713492, -0.033...",positive,positive
2,1650093000.0,26,female,sports,english,"These are disgusting, it tastes like you are ""...","[0.40487882, 0.8235396, 0.38333943, -0.4269158...",negative,negative
3,1650093000.0,26,male,other,english,My husband has a pair of TaoTronics so I decid...,"[0.018816521, 0.53441304, 0.4907303, -0.024163...",neutral,neutral
4,1650093000.0,37,male,home_improvement,english,"Threads too deep. Engages on tank, but gasket ...","[-0.25348073, 0.31603432, 0.35810202, -0.24672...",negative,negative


In [4]:
cacacaca

NameError: name 'cacacaca' is not defined

In [5]:
ds = Dataset.from_hdf(
    f"./fixtures/{test_local_filename}.hdf5", schema=schema, keys=["training", "production"]
)

In [None]:
ds

In [None]:
from typing import Any, List, Literal, Optional, Sequence, TypeVar


T = TypeVar("T", bound=type[Any])


def list_of(lst: Sequence[object], tp: T) -> bool:
    return isinstance(lst, list) and all(isinstance(x, tp) for x in lst)

In [None]:
x = ["x", "y"]

In [None]:
list_of(x, str)

## Obtain the point cloud


In [None]:
UMAP_hyperparameters = {
    "n_components": 3,
    "min_dist": 0,
}
projector = UMAPProjector(hyperparameters=UMAP_hyperparameters)
primary_pts, reference_pts, clusters = projector.project(prod_ds, train_ds, "embedding_feature")
pc = DriftPointCloud(primary_pts, reference_pts, clusters)

In [None]:
widget = UMAPWidget(pc.to_json())
widget.show()