In [None]:
!pip install -q accelerate arize-phoenix datasets openai transformers

In [None]:
import datetime
import os
import re
import textwrap
import uuid

from datasets import load_dataset
import openai
import pandas as pd
import phoenix as px
import torch
import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer


# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# pasted into the notebook; the placeholder is only the fallback when the
# variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your key here")

In [None]:
# model_type = "databricks/dolly-v2-3b"
model_type = "EleutherAI/pythia-2.8b"
tokenizer = AutoTokenizer.from_pretrained(model_type, padding_side="left")
# Device that prompt tensors are moved to before they are fed to the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device_map="auto" lets accelerate decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(model_type, device_map="auto")
# A model that was spread over several devices (or partly offloaded) must not be
# moved with .to(); only relocate it when it ended up on a single device.
if len(set(getattr(model, "hf_device_map", {}).values())) <= 1:
    model.to(device)

In [None]:
def create_mean_embedding(text, model, tokenizer):
    """
    Creates an embedding for a piece of text by averaging the last hidden state
    of the model over the sequence length dimension.

    Args:
        text: The string to embed.
        model: Model that is called with the tokenized inputs and
            output_hidden_states=True; its `device` attribute decides where
            the inputs are placed.
        tokenizer: Callable that turns the text into a dict of tensors when
            called with return_tensors="pt".

    Returns:
        A NumPy array holding the mean, over the sequence dimension, of the
        last hidden state of the first sequence in the batch.
    """

    tokens = tokenizer(text, return_tensors="pt")
    # Follow the model rather than a notebook-level global so the helper works
    # with whatever model it is handed.
    inputs = {name: tensor.to(model.device) for name, tensor in tokens.items()}
    # Inference only: disabling autograd avoids keeping activations alive.
    with torch.no_grad():
        output = model(**inputs, output_hidden_states=True)
    # hidden_states[-1] is the last hidden state, shaped
    # [batch_size, sequence_length, hidden_size]; take the first batch entry
    # and average across its tokens.
    last_hidden_state = output.hidden_states[-1][0]
    return last_hidden_state.detach().cpu().mean(dim=0).numpy()

In [None]:
# Load the Alpaca-formatted Dolly 15k instruction dataset and display its summary.
DATASET_NAME = "c-s-ale/dolly-15k-instruction-alpaca-format"
dataset = load_dataset(DATASET_NAME)
dataset

In [None]:
df = dataset["train"].to_pandas()
# Widen the pandas display so that long prompts are shown in full.
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)

# Template used when the row carries additional input context.
ALPACA_PROMPT_TEMPLATE_NON_EMPTY_INPUT = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
"""

# Template used when the row has no input context.
ALPACA_PROMPT_TEMPLATE_EMPTY_INPUT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""

# Number of leading examples kept from every category.
EXAMPLES_PER_CATEGORY = 12


def build_alpaca_prompt(row):
    """Renders the Alpaca prompt for one row, with or without an input section."""
    if row["input"]:
        return ALPACA_PROMPT_TEMPLATE_NON_EMPTY_INPUT.format(
            instruction=row["instruction"], input=row["input"]
        )
    return ALPACA_PROMPT_TEMPLATE_EMPTY_INPUT.format(instruction=row["instruction"])


df["prompt"] = df.apply(build_alpaca_prompt, axis=1)
# Keep only the first EXAMPLES_PER_CATEGORY rows of each category and renumber.
df = df.groupby("category").head(EXAMPLES_PER_CATEGORY).reset_index(drop=True)
df

In [None]:
# Sampling temperature passed to model.generate; it is also recorded in the
# name of the parquet file written at the end of the notebook.
TEMPERATURE = 0.1

In [None]:
def find_eos_token_position(
    generated_ids,
    prompt_len,
    eos_token_id=50277,
):
    """
    Finds the position of the first EOS token that appears at or after
    prompt_len in the generated_ids tensor.

    Args:
        generated_ids: 1-D tensor of token ids (prompt followed by generation).
        prompt_len: Number of leading prompt tokens to skip.
        eos_token_id: Token id treated as end-of-sequence. Defaults to 50277
            for Dolly, which does not match tokenizer.eos_token_id for some
            reason.

    Returns:
        The index into generated_ids (as a 0-dim tensor) of the first match,
        or None if the token does not occur after the prompt.
    """

    match_offsets = (generated_ids[prompt_len:] == eos_token_id).nonzero(as_tuple=True)[0]
    # No match: return None so callers can use the result directly as an open
    # slice bound.
    if match_offsets.numel() == 0:
        return None
    # Offsets are relative to the end of the prompt; shift back to absolute.
    return match_offsets[0] + prompt_len

In [None]:
OUTPUT_COLUMNS = [
    "conversation_id",
    "prompt",
    "response",
    "prompt_embedding",
    "response_embedding",
]
# Rows are collected in a list and turned into a frame once at the end, which
# avoids re-copying the whole frame on every iteration.
output_records = []
for index, row in df.iterrows():
    prompt = row["prompt"]
    print(f"Index: {index}")
    print()
    print(f"Prompt: {prompt}")
    print()
    prompt_inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = prompt_inputs["input_ids"].to(device)
    prompt_len = len(input_ids[0])
    # Only move the mask when the tokenizer actually produced one.
    attention_mask = prompt_inputs.get("attention_mask", None)
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)
    # Parameters grabbed from https://huggingface.co/databricks/dolly-v2-12b/blob/main/instruct_pipeline.py
    # Hidden states are not requested here: embeddings are computed separately
    # below, so collecting them for every generation step would only cost memory.
    model_data_output = model.generate(
        input_ids,
        do_sample=True,
        temperature=TEMPERATURE,
        attention_mask=attention_mask,
        max_new_tokens=256,
        top_p=0.92,
        top_k=0,
        return_dict_in_generate=True,
        pad_token_id=tokenizer.pad_token_id,
    )
    generated_ids = model_data_output.sequences[0]
    # The current code has an issue where generation continues even if the EOS token is generated.
    # Find the position of the first EOS token and only decode until that position.
    # https://huggingface.co/databricks/dolly-v2-12b/discussions/19
    # Trim at whichever end marker comes first: the helper's default (Dolly's
    # end token) or the tokenizer's own EOS token, which would otherwise be
    # decoded into the response text.
    end_positions = [find_eos_token_position(generated_ids, prompt_len)]
    if tokenizer.eos_token_id is not None:
        end_positions.append(
            find_eos_token_position(
                generated_ids, prompt_len, eos_token_id=tokenizer.eos_token_id
            )
        )
    end_positions = [int(position) for position in end_positions if position is not None]
    eos_token_position = min(end_positions) if end_positions else None
    generated_text = tokenizer.decode(generated_ids[prompt_len:eos_token_position]).strip()
    print("Generated Text")
    print(generated_text)

    # Compute embeddings
    prompt_embedding = create_mean_embedding(prompt, model, tokenizer)
    response_embedding = create_mean_embedding(generated_text, model, tokenizer)

    # Record the row for the output dataframe
    output_records.append(
        {
            "conversation_id": str(uuid.uuid4()),
            "prompt": prompt,
            "response": generated_text,
            "prompt_embedding": prompt_embedding,
            "response_embedding": response_embedding,
        }
    )

# Passing the column list keeps the schema stable even when no rows were produced.
output_df = pd.DataFrame(output_records, columns=OUTPUT_COLUMNS)

In [None]:
# User message sent to the evaluation model: the prompt/response pair to grade,
# filled in through the {prompt} and {response} placeholders.
EVALUATION_PROMPT_TEMPLATE = """Prompt:

{prompt}

Response:

{response}"""

# System message that instructs the evaluation model to reply with a score in [0, 1].
EVALUATION_SYSTEM_MESSAGE = "You are an evaluation model that evaluates the quality of the response in prompt-response pairs. Please score the result from 0-1 based on how good the answer is, where 0 is the worst. You must respond only with floating point values in the interval [0, 1], inclusive of both endpoints."


def evaluate_response_quality(prompt, response, model="gpt-4"):
    """
    Asks an OpenAI chat model to grade a prompt-response pair.

    Args:
        prompt: The prompt that was given to the model under evaluation.
        response: The response that model produced.
        model: Name of the OpenAI chat model used as the grader.

    Returns:
        The grader's reply as a stripped string. The system message asks for a
        number in [0, 1], but the reply is returned as-is without parsing.
    """
    evaluation_prompt = EVALUATION_PROMPT_TEMPLATE.format(prompt=prompt, response=response)
    request = {
        "model": model,
        "messages": [
            {"role": "system", "content": EVALUATION_SYSTEM_MESSAGE},
            {"role": "user", "content": evaluation_prompt},
        ],
        "temperature": 0.0,
    }
    # openai>=1.0 replaced openai.ChatCompletion with openai.chat.completions;
    # use whichever interface the installed client provides.
    if hasattr(openai, "chat"):
        api_response = openai.chat.completions.create(**request)
    else:
        api_response = openai.ChatCompletion.create(**request)
    evaluation_score = api_response.choices[0].message.content.strip()
    print(evaluation_prompt)
    print(evaluation_score)
    return evaluation_score


def compute_eval_column(dataframe):
    """
    Scores every row of the dataframe with evaluate_response_quality.

    Args:
        dataframe: Frame with "prompt" and "response" columns.

    Returns:
        A Series aligned with the dataframe's index that holds one evaluation
        result per row; an empty Series when the frame has no rows.
    """
    # An explicit loop calls the evaluator exactly once per real row and always
    # yields a Series. DataFrame.apply(axis=1) may probe the function with a
    # placeholder row when the frame is empty, which here would mean a
    # needless API request.
    scores = [
        evaluate_response_quality(prompt, response)
        for prompt, response in zip(dataframe["prompt"], dataframe["response"])
    ]
    return pd.Series(scores, index=dataframe.index, dtype=None if scores else object)


# Grade every generated response and store the grader's reply per row.
output_df["evaluation_score"] = compute_eval_column(output_df)

In [None]:
# Inspect the generated responses together with their evaluation scores.
output_df

In [None]:
# Turn the grader's replies into numbers; anything that cannot be parsed
# becomes NaN. The cell displays how many scores ended up missing.
numeric_scores = pd.to_numeric(output_df["evaluation_score"], errors="coerce")
output_df["evaluation_score"] = numeric_scores
numeric_scores.isna().sum()

In [None]:
# Number of rows that contain at least one missing value, i.e. the rows that
# dropna() would remove.
len(output_df) - len(output_df.dropna())

In [None]:
# Discard every row that has a missing value in any column (for example a score
# that could not be parsed as a number).
output_df = output_df.dropna()

In [None]:
# Average evaluation score over the remaining rows.
output_df["evaluation_score"].mean()

In [None]:
# Short model name: the part of model_type that follows the last "/".
model_name = model_type.rsplit("/", 1)[-1]

In [None]:
# Attach the local timezone so the %z directive below actually renders a UTC
# offset; for a naive datetime it expands to an empty string.
now = datetime.datetime.now().astimezone()
timestamp = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
# The file name records the model, the time of the run and the temperature.
file_name = f"{model_name}_{timestamp}_temp-{TEMPERATURE}.parquet"
output_df.to_parquet(file_name, index=False)

In [None]:
# Tell Phoenix which columns hold the raw text and which hold the embedding
# vector, separately for the prompt and for the response.
prompt_embedding_columns = px.EmbeddingColumnNames(
    raw_data_column_name="prompt",
    vector_column_name="prompt_embedding",
)
response_embedding_columns = px.EmbeddingColumnNames(
    raw_data_column_name="response",
    vector_column_name="response_embedding",
)
# "prompt_category" is intentionally left out of the tag columns.
tag_columns = ["conversation_id", "evaluation_score"]
schema = px.Schema(
    prompt_column_names=prompt_embedding_columns,
    response_column_names=response_embedding_columns,
    tag_column_names=tag_columns,
)
ds = px.Dataset(output_df, schema, name=model_name)

In [None]:
# Launch the Phoenix app on the dataset built above and keep the session handle.
session = px.launch_app(ds)