# Summary Evaluations

Download Amazon product reviews and parse the raw data into a pandas dataframe.

In [None]:
import gzip
from typing import List
from urllib.request import urlopen

import pandas as pd
import tiktoken
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from phoenix.evals import OpenAIModel, PromptTemplate
from phoenix.evals.evaluators import MapReducer, Refiner

In [None]:
url = "https://snap.stanford.edu/data/amazon/Cell_Phones_&_Accessories.txt.gz"
data = []
review_data = {}
# Stream the gzipped text straight from the HTTP response. The parser treats
# each non-blank line as one "key: value" field of the current review and a
# blank line as the end of that review.
with urlopen(url) as response, gzip.open(response, "rt", encoding="utf-8") as unzipped:
    for raw_line in unzipped:
        stripped = raw_line.strip()
        if not stripped:
            # Blank line: close out the record collected so far (if any).
            if review_data:
                data.append(review_data)
                review_data = {}
            continue
        # Split on the first ": " only, so values may themselves contain it.
        key, delimiter, value = stripped.partition(": ")
        # A line without the delimiter is stored as a key with no value.
        review_data[key] = value if delimiter else None
    # The stream may end without a trailing blank line; keep the last record.
    if review_data:
        data.append(review_data)

# One row per review, one column per field name seen in the file.
df = pd.DataFrame(data)
df.head()

In [None]:
# Count how many rows (reviews) the dataframe holds for each product ID.
df["product/productId"].value_counts()

In [None]:
# Keep only the reviews of a single product, then tally its review summaries.
target_product_id = "B0009B0IX4"
is_target_product = df["product/productId"] == target_product_id
product_df = df[is_target_product]
product_df["review/summary"].value_counts()

Gather documents into chunks.

In [None]:
# Tokenizer used below to measure documents and separators in tokens.
encoding = tiktoken.get_encoding("cl100k_base")


def gather_documents_into_chunks(
    documents: List[str],
    max_tokens_per_chunk: int,
    separator="\n\n======\n\n",
) -> List[str]:
    """Greedily pack documents, in order, into separator-joined chunks.

    Documents are appended to the current chunk for as long as the running
    token count (documents plus the separators between them) stays within
    ``max_tokens_per_chunk``; the first document that would overflow starts a
    new chunk.

    Args:
        documents: Texts to pack; their order is preserved.
        max_tokens_per_chunk: Token budget for each chunk, measured with the
            module-level ``encoding``.
        separator: Text inserted between adjacent documents of one chunk.

    Returns:
        The chunks, each being its documents joined by ``separator``. A
        document that alone exceeds the budget is still emitted, as a chunk
        of its own. An empty ``documents`` list yields an empty list.
    """
    separator_cost = len(encoding.encode(separator))
    packed: List[str] = []
    pending: List[str] = []
    pending_tokens = 0
    for text in documents:
        text_cost = len(encoding.encode(text))
        # The separator is only paid for when joining onto an earlier document.
        cost_if_appended = text_cost + separator_cost if pending else text_cost
        if pending_tokens + cost_if_appended > max_tokens_per_chunk:
            # Over budget: flush what we have and start a new chunk with this
            # document (nothing to flush if the chunk was still empty).
            if pending:
                packed.append(separator.join(pending))
            pending, pending_tokens = [text], text_cost
            continue
        pending.append(text)
        pending_tokens += cost_if_appended
    # Flush the final, partially filled chunk.
    if pending:
        packed.append(separator.join(pending))
    return packed

In [None]:
gpt4_context_window_in_tokens = 8192
# Shuffle the product's review texts reproducibly (fixed random_state).
documents = product_df["review/text"].sample(frac=1, random_state=0).to_list()
# Keep each chunk 1000 tokens below the context window so the chunk is not the
# only thing that has to fit (presumably prompt text and the model's reply).
chunks = gather_documents_into_chunks(
    documents,
    gpt4_context_window_in_tokens - 1000,
)
# Only the first three chunks are used in the rest of the notebook.
chunks = chunks[:3]
chunks

Summarize with a LangChain "refine" chain.

In [None]:
# Wrap every chunk in a LangChain Document, the input type the chain is run on.
documents = [Document(page_content=text) for text in chunks]
# Build a "refine" summarization chain on top of GPT-4 and run it over the
# wrapped chunks; the resulting text is the summary evaluated further below.
llm = ChatOpenAI(model="gpt-4")
chain = load_summarize_chain(llm, chain_type="refine")
summary = chain.run(documents)
print(summary)

Evaluate the summary using `MapReducer`.

In [None]:
# Judge model shared by the map and reduce prompts.
model = OpenAIModel(
    model_name="gpt-4",
)
# `summary` is baked into both templates by the f-string pieces when this cell
# runs; {chunk} and {mapped} are left as template variables for the evaluator.
# NOTE(review): because `summary` is interpolated before PromptTemplate sees the
# text, literal braces inside the summary may be read as template variables --
# confirm against PromptTemplate's parsing if summaries can contain "{" / "}".

# Map step: evaluate the summary against one chunk of source documents.
# Every "=======" divider is surrounded by blank lines so that the SUMMARY,
# CONTEXT and EVALUATION sections are laid out the same way as in the reduce
# prompt (the divider before SUMMARY previously ran straight into the label).
map_prompt_template = PromptTemplate(
    "You will be given a CONTEXT that contains multiple documents. "
    "You will also be given a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents. "
    "You must provide an EVALUATION of the quality of the SUMMARY relative to the provided CONTEXT. "
    "Your EVALUATION should judge the quality of the SUMMARY and should concisely explain your reasoning. "
    "Bear in mind that the SUMMARY may include information from unseen documents. "
    "Focus on important points, not trivial details."
    "\n\n"
    "======="
    "\n\n"
    f"SUMMARY: {summary}"
    "\n\n"
    "======="
    "\n\n"
    "CONTEXT: {chunk}"
    "\n\n"
    "======="
    "\n\n"
    "EVALUATION: "
)
# Reduce step: collapse the per-chunk evaluations into a single "good"/"bad"
# verdict, as instructed in the prompt text.
reduce_prompt_template = PromptTemplate(
    "You will be given a SUMMARY that summarizes a large number of documents. "
    "You will also be given a list of EVALUATIONS of the quality of that SUMMARY. "
    "Each evaluation judges the SUMMARY relative to a different subset of the documents it summarizes. "
    "Given this list, you must provide a single, OVERALL EVALUATION of the quality of the SUMMARY that should take into account the individual EVALUATIONS. "
    'Your OVERALL EVALUATION should judge the quality of the SUMMARY as either "good" or "bad" and should only contain one of those two words with no additional explanation.'
    "\n\n"
    "======="
    "\n\n"
    f"SUMMARY: {summary}"
    "\n\n"
    "======="
    "\n\n"
    "EVALUATIONS: {mapped}"
    "\n\n"
    "======="
    "\n\n"
    "OVERALL EVALUATION: "
)
evaluator = MapReducer(
    model=model,
    map_prompt_template=map_prompt_template,
    reduce_prompt_template=reduce_prompt_template,
)

In [None]:
# Run the evaluator over the chunks and print its result. The final prompt
# asks the model to answer with only "good" or "bad".
summary_evaluation = evaluator.evaluate(chunks)
print(summary_evaluation)

Evaluate the summary using `Refiner`.

In [None]:
# Judge model shared by the initial, refine and synthesize prompts.
model = OpenAIModel(model_name="gpt-4")
# `summary` is baked into every template by the f-string pieces when this cell
# runs; {chunk} and {accumulator} are left as template variables for the
# evaluator.
# NOTE(review): because `summary` is interpolated before PromptTemplate sees the
# text, literal braces inside the summary may be read as template variables --
# confirm against PromptTemplate's parsing if summaries can contain "{" / "}".

# Initial step: evaluate the summary against a chunk with no prior evaluation.
# Every "=======" divider is surrounded by blank lines; previously the CONTEXT
# and EVALUATION sections were glued to their dividers
# ("=======CONTEXT: ...=======EVALUATION: "), blurring the section boundaries.
initial_prompt_template = PromptTemplate(
    "You will be given a CONTEXT that contains multiple documents. "
    "You will also be given a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents. "
    "You must provide an EVALUATION of the quality of the SUMMARY relative to the provided CONTEXT. "
    "Your EVALUATION should judge the quality of the SUMMARY and should concisely explain your reasoning. "
    "Bear in mind that the SUMMARY may include information from unseen documents. "
    "Focus on important points, not trivial details."
    "\n\n"
    "======="
    "\n\n"
    f"SUMMARY: {summary}"
    "\n\n"
    "======="
    "\n\n"
    "CONTEXT: {chunk}"
    "\n\n"
    "======="
    "\n\n"
    "EVALUATION: "
)
# Refine step: update the accumulated evaluation in light of another chunk.
# Fixes the missing blank line before SUMMARY and the duplicated word in
# "mention explicitly mention".
refine_prompt_template = PromptTemplate(
    "You will be given: \n"
    "  - a CONTEXT that contains multiple documents\n"
    "  - a SUMMARY that summarizes the documents in the CONTEXT in addition to other (unseen) documents\n"
    "  - an ACCUMULATED EVALUATION of the quality of the SUMMARY relative to other subsets of the summarized documents\n"
    "You must provide a REFINED EVALUATION of the quality of the SUMMARY that considers the current CONTEXT. "
    "Bear in mind that the SUMMARY may include information from unseen documents, although you don't need to explicitly mention that. "
    "Focus on important points, not trivial details."
    "\n\n"
    "======="
    "\n\n"
    f"SUMMARY: {summary}"
    "\n\n"
    "======="
    "\n\n"
    "CONTEXT: {chunk}"
    "\n\n"
    "======="
    "\n\n"
    "ACCUMULATED EVALUATION: {accumulator}"
    "\n\n"
    "======="
    "\n\n"
    "REFINED EVALUATION: "
)
# Synthesize step: compress the verbose accumulated evaluation into a single
# "good"/"bad" verdict, as instructed in the prompt text.
synthesize_prompt_template = PromptTemplate(
    "You will be given a SUMMARY that summarizes a large number of documents. "
    "You will also be given a VERBOSE EVALUATION of the quality of that SUMMARY. "
    "Given this VERBOSE EVALUATION, you must provide a single, CONCISE EVALUATION of the quality of the SUMMARY. "
    'Your CONCISE EVALUATION should judge the quality of the SUMMARY as either "good" or "bad" and should only contain one of those two words with no additional explanation.'
    "\n\n"
    "======="
    "\n\n"
    f"SUMMARY: {summary}"
    "\n\n"
    "======="
    "\n\n"
    "VERBOSE EVALUATION: {accumulator}"
    "\n\n"
    "======="
    "\n\n"
    "CONCISE EVALUATION: "
)
evaluator = Refiner(
    model=model,
    initial_prompt_template=initial_prompt_template,
    refine_prompt_template=refine_prompt_template,
    synthesize_prompt_template=synthesize_prompt_template,
)

In [None]:
# Run the evaluator over the same chunks and print its result. The final
# prompt asks the model to answer with only "good" or "bad".
summary_evaluation = evaluator.evaluate(chunks)
print(summary_evaluation)