# Create Taylor Swift Embeddings Data

Use this Colab notebook to generate embeddings for Taylor Swift's song lyrics and analyze trends in them using [Phoenix OSS](https://github.com/Arize-ai/phoenix). Download the Kaggle dataset [here](https://www.kaggle.com/datasets/PromptCloudHQ/taylor-swift-song-lyrics-from-all-the-albums?select=taylor_swift_lyrics.csv); you will upload the `taylor_swift_lyrics.csv` file from it in one of the cells below.

In [None]:
!pip install -q arize-phoenix

In [None]:
import pandas as pd
import phoenix as px

In [None]:
from google.colab import files

# Ask for a file upload through Colab's `files` helper and keep the result in
# `uploaded`. Pick the `taylor_swift_lyrics.csv` file downloaded from Kaggle:
# the following cell opens the file by exactly that name.
# NOTE(review): this cell only works inside Google Colab; elsewhere, place the CSV
# next to the notebook instead.
uploaded = files.upload()  # upload CSV from Kaggle

In [None]:
# Load the uploaded lyrics CSV into a DataFrame. The file is decoded as
# ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
# NOTE(review): presumably the Kaggle file contains bytes that are not valid UTF-8 — confirm.
df = pd.read_csv("taylor_swift_lyrics.csv", encoding="ISO-8859-1")
# Bare expression so the notebook renders the loaded frame as this cell's output.
df

In [None]:
!pip install -q arize["AutoEmbeddings"]

In [None]:
from arize.pandas.embeddings import EmbeddingGenerator, UseCases

# Replace the current index with a fresh 0..n-1 index (the old one is dropped).
# NOTE(review): presumably the embedding generator expects a default index — confirm.
df = df.reset_index(drop=True)

# Settings handed to the generator factory: an NLP sequence-classification use
# case backed by the "distilbert-base-uncased" model.
generator_settings = {
    "use_case": UseCases.NLP.SEQUENCE_CLASSIFICATION,
    "model_name": "distilbert-base-uncased",
    "tokenizer_max_length": 512,
    "batch_size": 100,
}
generator = EmbeddingGenerator.from_use_case(**generator_settings)

# Embed the `lyric` column and store the result next to the text it came from.
lyric_vectors = generator.generate_embeddings(text_col=df["lyric"])
df["lyric_vector"] = lyric_vectors

In [None]:
# Display the frame again; it should now also carry the `lyric_vector` column
# added by the embedding cell.
df

In [None]:
# Pair the embedding vectors with the raw lyric text they were computed from.
lyric_embedding_columns = px.EmbeddingColumnNames(
    vector_column_name="lyric_vector",
    raw_data_column_name="lyric",
)

# Phoenix schema: three plain feature columns plus one embedding feature,
# registered under the name "taylors_embedding".
schema = px.Schema(
    feature_column_names=["year", "album", "track_title"],
    embedding_feature_column_names={"taylors_embedding": lyric_embedding_columns},
)

In [None]:
# Wrap the DataFrame and its schema in a Phoenix dataset, then start the app on it.
# The launch call stays the last expression so its return value is shown as cell output.
lyrics_dataset = px.Dataset(df, schema)
px.launch_app(lyrics_dataset)

In [None]:
# Fetch the currently active Phoenix session (the one started by `px.launch_app`
# above) and show its view as this cell's output.
# NOTE(review): requires the launch cell to have been run first in this kernel.
px.active_session().view()