# Dolly vs. Pythia Comparison

This notebook uses Phoenix to visualize the embeddings of prompt-response pairs generated using Dolly and Pythia.

In [None]:
!pip install -q arize-phoenix

Import libraries.

In [None]:
import ast
import hashlib
import re

import numpy as np
import pandas as pd
import phoenix as px

# None removes the column-width limit, so long text values are displayed without truncation.
pd.set_option("display.max_colwidth", None)

In [None]:
# Locations of the CSV fixtures that are read with pd.read_csv further down.
# One assignment per model is active; the commented-out lines are alternative fixture files
# that can be swapped in by moving the comment marker.
# pythia_file = "https://storage.googleapis.com/arize-assets/fixtures/Embeddings/GENERATIVE/pythia-2.8b_2023-05-27_16-54-20.csv"
pythia_file = "https://storage.googleapis.com/arize-assets/fixtures/Embeddings/GENERATIVE/pythia-2.8b_2023-06-01_02-51-40.csv"
# pythia_file = "https://storage.googleapis.com/arize-assets/fixtures/Embeddings/GENERATIVE/pythia-2.8b-deduped_2023-06-01_06-37-21.csv"
# pythia_file = "https://storage.googleapis.com/arize-assets/fixtures/Embeddings/GENERATIVE/pythia-2.8b_2023-06-01_06-35-09.csv"
dolly_file = "https://storage.googleapis.com/arize-assets/fixtures/Embeddings/GENERATIVE/dolly-v2-3b_2023-06-01_02-12-20.csv"
# dolly_file = "https://storage.googleapis.com/arize-assets/fixtures/Embeddings/GENERATIVE/dolly-v2-3b_2023-05-27_18-23-02.csv"

Download your data.

In [None]:
# One decimal literal: optional sign, a mantissa with digits on at least one side of the
# decimal point, and an optional exponent. Treating the exponent as an optional suffix of
# every mantissa form keeps "1e-05" and "1.e-05" as a single number instead of two.
_NUMBER_PATTERN = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")


def string_to_array(s):
    """Parse every number embedded in a string into a 1-D float array.

    Any text between numbers (brackets, commas, whitespace, newlines) is ignored.
    Recognised forms are integers ("3"), decimals ("3.", "3.5", ".5") and any of
    those followed by an exponent ("1e-05", "1.e-05", "1.25E+02").

    Args:
        s: String containing the numbers, e.g. "[ 1.25e-01 -2.5 ]".

    Returns:
        A float numpy array with one element per number found, in order of
        appearance. The array is empty when the string contains no numbers.

    Raises:
        TypeError: If ``s`` is not a string (raised by the regex search).
    """
    return np.array([float(token) for token in _NUMBER_PATTERN.findall(s)], dtype=float)


def _load_embedding_frame(csv_location):
    """Read one fixture CSV and replace its stringified vector columns with array columns.

    The "prompt_embedding_vec" and "paragraph_embedding_vec" columns are parsed with
    string_to_array into "prompt_embedding" and "paragraph_embedding", after which the
    two source columns are dropped.
    """
    return (
        pd.read_csv(csv_location)
        .assign(
            prompt_embedding=lambda frame: frame["prompt_embedding_vec"].apply(string_to_array),
            paragraph_embedding=lambda frame: frame["paragraph_embedding_vec"].apply(
                string_to_array
            ),
        )
        .drop(columns=["paragraph_embedding_vec", "prompt_embedding_vec"])
    )


pythia_df = _load_embedding_frame(pythia_file)
dolly_df = _load_embedding_frame(dolly_file)

View the first few rows of each dataset.

In [None]:
# Preview the Dolly frame (head() shows the first five rows by default).
dolly_df.head()

In [None]:
# Preview the Pythia frame (head() shows the first five rows by default).
pythia_df.head()

Compute a unique ID for each prompt by hashing the prompt text. This lets you match up the multiple datapoints that respond to the same prompt, so you can see how the response "unfolds" in the latent space of the model.

Here are some interesting prompt hashes to check out once you launch Phoenix.

(pythia-2.8b_2023-06-01_02-51-40)

- 934bc7ae9b678cb0ca42ecfc45239716 (Bernie Sanders)
- 934bc7ae9b678cb0ca42ecfc45239716 (pollution)
- cbee03fe7a6de75418dc69304f54b478
- a75eae577fe7f81538237e5b7c9eeeed (AWS)

In [None]:
def hash_string(string):
    """Return the MD5 digest of ``string`` as a hexadecimal string.

    The text is encoded as UTF-8 before hashing, so equal strings always map to
    the same digest.
    """
    encoded = string.encode("utf-8")
    return hashlib.md5(encoded).hexdigest()


# Tag every row with the hash of its prompt text, so rows that share a prompt share an ID.
pythia_df["prompt_id"] = pythia_df["prompt"].map(hash_string)
dolly_df["prompt_id"] = dolly_df["prompt"].map(hash_string)

In [None]:
# Force the Pythia "evals" column to numeric: values that cannot be parsed become NaN
# (errors="coerce") and are then replaced with 0, so they count as zero scores in any
# later aggregate such as the mean.
pythia_df["evals"] = pd.to_numeric(pythia_df["evals"], errors="coerce").fillna(0)

Find the mean evaluation score for the two datasets.

In [None]:
# Mean evaluation score for Dolly.
# NOTE(review): unlike pythia_df, dolly_df["evals"] is not passed through
# pd.to_numeric(...).fillna(0) anywhere in this notebook, so this assumes the column was
# already read as numeric, and missing values here are skipped rather than counted as 0 —
# confirm the two means are comparable.
dolly_df["evals"].mean()

In [None]:
# Mean evaluation score for Pythia, computed on the coerced column (unparseable values are 0).
pythia_df["evals"].mean()

Launch Phoenix with one dataset, then the other.

In [None]:
# Pair each raw-text column with the column holding its embedding vector.
prompt_embedding_columns = px.EmbeddingColumnNames(
    raw_data_column_name="prompt",
    vector_column_name="prompt_embedding",
)
response_embedding_columns = px.EmbeddingColumnNames(
    raw_data_column_name="response_paragraph",
    vector_column_name="paragraph_embedding",
)

# Columns handed to Phoenix as tags; "prompt_id" is the prompt hash computed earlier.
tag_columns = [
    "prompt_category",
    "conversation_id",
    "response_capitalized",
    "response_text",
    "prompt_id",
]

# A single schema is built here and reused for both the Pythia and Dolly dataframes.
schema = px.Schema(
    prompt_column_names=prompt_embedding_columns,
    response_column_names=response_embedding_columns,
    tag_column_names=tag_columns,
)

In [None]:
# Wrap each dataframe, together with the shared schema, in a named Phoenix dataset.
pythia_ds = px.Dataset(dataframe=pythia_df, schema=schema, name="pythia")
dolly_ds = px.Dataset(dataframe=dolly_df, schema=schema, name="dolly")

In [None]:
# Launch Phoenix on the Pythia dataset; `session` holds the value returned by launch_app.
session = px.launch_app(pythia_ds)

In [None]:
# Launch Phoenix on the Dolly dataset.
# NOTE(review): this rebinds `session`, so the value returned for the Pythia launch is no
# longer reachable under that name once this cell has run.
session = px.launch_app(dolly_ds)